/*
 *  $URL: svn://svn.webarts.bc.ca/open/trunk/projects/WebARTS/ca/bc/webarts/tools/UrlScraper.java $
 *  $Author: tgutwin $
 *  $Revision: 1255 $
 *  $Date: 2018-03-17 20:34:04 -0700 (Sat, 17 Mar 2018) $
 */
/*
 *  $Rev: 1255 $:     Revision of last commit
 *  $Author: tgutwin $:  Author of last commit
 *  $Date: 2018-03-17 20:34:04 -0700 (Sat, 17 Mar 2018) $:    Date of last commit
 *  Copyright (C) 2017-2018 WebARTS Design,
 *  North Vancouver Canada. All Rights Reserved.
 *
 *  Written by Tom Gutwin - WebARTS Design.
 *  http://www.webarts.bc.ca
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package ca.bc.webarts.tools;


// import ca.bc.webarts.tools.MyCookieHandler;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;

import javax.json.*;
import javax.net.ssl.HttpsURLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * A very basic tool using JSoup to log in to a web page, get authenticated and then load
 * (and optionally chop a section out of) another page.
 * <p>
 * Typical use: configure the login url / form details, call {@link #doLogin()} and, when it
 * returns true, call one of the {@code doScrape} methods.
 **/
public class UrlScraper
{

  /** A holder for this clients System File Separator. */
  public static final String SYSTEM_FILE_SEPERATOR = File.separator;
  /** A holder for this clients System line termination separator. */
  public static final String SYSTEM_LINE_SEPERATOR =
    System.getProperty("line.separator");
  /** The User-Agent header value sent with every request. */
  protected static final String USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0";
  /** The moment this class was loaded; the source for dateStr_. */
  protected static Calendar rightNow_ = Calendar.getInstance();
  /** The class-load date as yyyy-MM-dd (zero padded); used in the file names read/written by test and main. */
  protected String dateStr_ = formatDate(rightNow_);
  protected boolean debugOut_ = false;
  protected MyCookieHandler cm_ = new MyCookieHandler();
  /** The raw Set-Cookie header values from the most recent GET (may be null). */
  protected List<String> cookies_;
  /** The most recently opened connection. */
  protected HttpsURLConnection connection_;
  protected String baseUrl_ = "/";
  protected boolean alreadyLoggedIn_ = false;
  private String loginUrl_ = "";
  /** Cache of the response text returned from the login page post. **/
  protected String postPageResponse_ = "";
  private HashMap<String, String> requestProps_ = new HashMap<String, String>();
  private String loginFormID_ = "";
  private String usernameFormElementName_ = "";
  private String passwordFormElementName_ = "";
  private String username_ = "";
  private String password_ = "";
  private String scrapePageUrl_ = "";
  private String scrapeStart_ = "";
  private String scrapeEnd_ = "";

  /** Holds the most recent page scrape UNchopped. **/
  private String pageContentCache_ = "";
  protected String stockSymbolCache_ = "";
  protected String marketSymbolCache_ = "";


  /** Default constructor; only installs a default CookieManager so that cookies are turned on. **/
  public UrlScraper()
  {
    // make sure cookies are turned on
    CookieHandler.setDefault(new CookieManager());
  }


  /** All In One constructor; sets every login and scrape field and turns cookies on. **/
  public UrlScraper(String loginUrl,
                    HashMap<String, String> requestProps,
                    String loginFormID,
                    String usernameFormElementName,
                    String passwordFormElementName,
                    String username,
                    String password,
                    String scrapePageUrl,
                    String scrapeStart,
                    String scrapeEnd)
  {
    loginUrl_ = loginUrl;
    requestProps_ = requestProps;
    loginFormID_ = loginFormID;
    usernameFormElementName_ = usernameFormElementName;
    passwordFormElementName_ = passwordFormElementName;
    username_ = username;
    password_ = password;
    scrapePageUrl_ = scrapePageUrl;
    scrapeStart_ = scrapeStart;
    scrapeEnd_ = scrapeEnd;

    // make sure cookies are turned on
    CookieHandler.setDefault(new CookieManager());
  }


  /**
   * Formats the date part of a Calendar as yyyy-MM-dd with zero padded month and day.
   *
   * @param cal the calendar to format
   * @return the formatted date, for example 2018-03-07
   **/
  private static String formatDate(Calendar cal)
  {
    int month = cal.get(Calendar.MONTH) + 1; // Calendar months are zero based
    int day = cal.get(Calendar.DAY_OF_MONTH);
    return cal.get(Calendar.YEAR) + "-" + (month < 10 ? "0" : "") + month + "-" + (day < 10 ? "0" : "") + day;
  }


  /**
   * Set Method for class field 'debugOut_' to true. Turns on extra debugging System.out stuff.
   *
   **/
  public void setDebugOut()
  {
    this.debugOut_ = true;
  } // setDebugOut Method


  /**
   * Set Method for class field 'debugOut_'. Turns on/off extra debugging System.out stuff.
   *
   * @param debugOut is the value to set this class field to.
   *
   **/
  public void setDebugOut(boolean debugOut)
  {
    this.debugOut_ = debugOut;
  } // setDebugOut Method


  /**
   * Get Method for class field 'debugOut_'.
   *
   * @return boolean - The value the class field 'debugOut_'.
   *
   **/
  public boolean getDebugOut()
  {
    return debugOut_;
  } // getDebugOut Method


  /**
   * Set Method for class field 'baseUrl_'.
   *
   * @param baseUrl is the value to set this class field to.
   *
   **/
  public void setBaseUrl(String baseUrl)
  {
    this.baseUrl_ = baseUrl;
  } // setBaseUrl Method


  /**
   * Get Method for class field 'baseUrl_'.
   *
   * @return String - The value the class field 'baseUrl_'.
   *
   **/
  public String getBaseUrl()
  {
    return baseUrl_;
  } // getBaseUrl Method


  /**
   * Set Method for class field 'loginUrl_'.
   *
   * @param loginUrl is the value to set this class field to.
   *
   **/
  public void setLoginUrl(String loginUrl)
  {
    this.loginUrl_ = loginUrl;
  } // setLoginUrl Method


  /**
   * Get Method for class field 'loginUrl_'.
   *
   * @return String - The value the class field 'loginUrl_'.
   *
   **/
  public String getLoginUrl()
  {
    return loginUrl_;
  } // getLoginUrl Method


  /**
   * Set Method for class field 'requestProps_'.
   *
   * @param requestProps is the value to set this class field to.
   *
   **/
  public void setRequestProps(HashMap<String, String> requestProps)
  {
    this.requestProps_ = requestProps;
  } // setRequestProps Method


  /**
   * Get Method for class field 'requestProps_'.
   *
   * @return HashMap&lt;String, String&gt; - The value the class field 'requestProps_'.
   *
   **/
  public HashMap<String, String> getRequestProps()
  {
    return requestProps_;
  } // getRequestProps Method


  /**
   * Set Method for class field 'loginFormID_'.
   *
   * @param loginFormID is the value to set this class field to.
   *
   **/
  public void setLoginFormID(String loginFormID)
  {
    this.loginFormID_ = loginFormID;
  } // setLoginFormID Method


  /**
   * Get Method for class field 'loginFormID_'.
   *
   * @return String - The value the class field 'loginFormID_'.
   *
   **/
  public String getLoginFormID()
  {
    return loginFormID_;
  } // getLoginFormID Method


  /**
   * Set Method for class field 'usernameFormElementName_'.
   *
   * @param usernameFormElementName is the value to set this class field to.
   *
   **/
  public void setUsernameFormElementName(String usernameFormElementName)
  {
    this.usernameFormElementName_ = usernameFormElementName;
  } // setUsernameFormElementName Method


  /**
   * Get Method for class field 'usernameFormElementName_'.
   *
   * @return String - The value the class field 'usernameFormElementName_'.
   *
   **/
  public String getUsernameFormElementName()
  {
    return usernameFormElementName_;
  } // getUsernameFormElementName Method


  /**
   * Set Method for class field 'passwordFormElementName_'.
   *
   * @param passwordFormElementName is the value to set this class field to.
   *
   **/
  public void setPasswordFormElementName(String passwordFormElementName)
  {
    this.passwordFormElementName_ = passwordFormElementName;
  } // setPasswordFormElementName Method


  /**
   * Get Method for class field 'passwordFormElementName_'.
   *
   * @return String - The value the class field 'passwordFormElementName_'.
   *
   **/
  public String getPasswordFormElementName()
  {
    return passwordFormElementName_;
  } // getPasswordFormElementName Method


  /**
   * Set Method for class field 'username_'.
   *
   * @param username is the value to set this class field to.
   *
   **/
  public void setUsername(String username)
  {
    this.username_ = username;
  } // setUsername Method


  /**
   * Get Method for class field 'username_'.
   *
   * @return String - The value the class field 'username_'.
   *
   **/
  public String getUsername()
  {
    return username_;
  } // getUsername Method


  /**
   * Set Method for class field 'password_'.
   *
   * @param password is the value to set this class field to.
   *
   **/
  public void setPassword(String password)
  {
    this.password_ = password;
  } // setPassword Method


  /**
   * Get Method for class field 'password_'.
   *
   * @return String - The value the class field 'password_'.
   *
   **/
  public String getPassword()
  {
    return password_;
  } // getPassword Method


  /**
   * Set Method for class field 'scrapePageUrl_'.
   *
   * @param scrapePageUrl is the value to set this class field to.
   *
   **/
  public void setScrapePageUrl(String scrapePageUrl)
  {
    this.scrapePageUrl_ = scrapePageUrl;
  } // setScrapePageUrl Method


  /**
   * Get Method for class field 'scrapePageUrl_'.
   *
   * @return String - The value the class field 'scrapePageUrl_'.
   *
   **/
  public String getScrapePageUrl()
  {
    return scrapePageUrl_;
  } // getScrapePageUrl Method


  /**
   * Set Method for class field 'scrapeStart_'.
   * This field is used as the substring start tag for the return when searching the response string.
   *
   * @param scrapeStart is the value to set this class field to.
   *
   **/
  public void setScrapeStart(String scrapeStart)
  {
    this.scrapeStart_ = scrapeStart;
  } // setScrapeStart Method


  /**
   * Get Method for class field 'scrapeStart_'.
   * This field is used as the substring start tag for the return when searching the response string.
   *
   * @return String - The value the class field 'scrapeStart_'.
   *
   **/
  public String getScrapeStart()
  {
    return scrapeStart_;
  } // getScrapeStart Method


  /**
   * Set Method for class field 'scrapeEnd_'.
   * This field is used as the substring end tag for the return when searching the response string.
   *
   * @param scrapeEnd is the value to set this class field to.
   *
   **/
  public void setScrapeEnd(String scrapeEnd)
  {
    this.scrapeEnd_ = scrapeEnd;
  } // setScrapeEnd Method


  /**
   * Get Method for class field 'scrapeEnd_'.
   * This field is used as the substring end tag for the return when searching the response string.
   *
   * @return String - The value the class field 'scrapeEnd_'.
   *
   **/
  public String getScrapeEnd()
  {
    return scrapeEnd_;
  } // getScrapeEnd Method


  /**
   * Get Method for class field 'cookies_'.
   *
   * @return the Set-Cookie header values saved by the last GET; null if none were saved
   **/
  public List<String> getCookies()
  {
    return cookies_;
  }


  /**
   * Set Method for class field 'cookies_'. In debug mode the new cookies are also printed.
   *
   * @param cookies the raw cookie header values to remember; may be null (a response
   *                without a Set-Cookie header yields null)
   **/
  public void setCookies(List<String> cookies)
  {
    if (debugOut_)
    {
      System.out.println("New cookies: " + cookies);
      if (cookies != null)
      {
        for (String cookie : cookies)
        {
          System.out.println(" --> cookie " + cookie);
        }
      }
    }

    this.cookies_ = cookies;
  }


  /**
   * Set Method for class field 'connection_'.
   *
   * @param connection is the value to set this class field to.
   *
   **/
  public void setConnection(HttpsURLConnection connection)
  {
    this.connection_ = connection;
  } // setConnection Method


  /**
   * Get Method for class field 'connection_'.
   *
   * @return HttpsURLConnection - The value the class field 'connection_'.
   *
   **/
  public HttpsURLConnection getConnection()
  {
    return connection_;
  } // getConnection Method


  /**
   * Sends the POST to the login url using the parameters held in the class fields.
   * It also caches the page response text into the class field postPageResponse_.<br>
   *
   * @return boolean true if loggedIn
   */
  public boolean doLogin()
  {
    return this.doLogin(loginUrl_, requestProps_, loginFormID_, usernameFormElementName_, passwordFormElementName_, username_, password_);
  }


  /**
   * Sends the POST to the login url with all required parameters.
   * It also caches the page response text into the class field postPageResponse_.<br>
   * The login page is fetched first so the form's own (hidden) fields can be posted back.
   * Any failure along the way results in a false return; it is only reported in debug mode.
   * <br> example requestProps<pre>
   *   HashMap &lt;String, String&gt; reqProps = new HashMap&lt;String, String&gt;();
   *   reqProps.put("Accept","text/html,application/xhtml+xml,application/xml");
   *   reqProps.put("Accept-Encoding ","gzip, deflate, br");
   *   reqProps.put("Accept-Language ","en-US,en;q=0.5");
   *   reqProps.put("Connection","keep-alive");
   *   reqProps.put("Content-Type","application/x-www-form-urlencoded");
   *   reqProps.put("Host","red.webarts.bc.ca");
   *   //reqProps.put("Referer","red.webarts.bc.ca");
   *   reqProps.put("Upgrade-Insecure-Requests","1");
   *</pre><br><B>NOTE:</B> you don't need to add the "User-Agent" or "Content-Length"
   *
   * @param loginUrl the url of the page holding the login form; also the url posted to
   * @param requestProps extra request headers for the POST (may be null)
   * @param formID the id (or name, or class) of the login form
   * @param usernameFormElementName the form element name of the username field
   * @param passwordFormElementName the form element name of the password field
   * @param username the username to log in with
   * @param password the password to log in with
   * @return boolean true if loggedIn (the POST answered with response code 200)
   **/
  public boolean doLogin( String loginUrl,
                          HashMap<String, String> requestProps,
                          String formID,
                          String usernameFormElementName,
                          String passwordFormElementName,
                          String username,
                          String password)
  {
    boolean retVal = false;
    try
    {
      // the base url is everything in front of the last '/'
      int lastSlash = loginUrl.lastIndexOf('/');
      baseUrl_ = (lastSlash >= 0 ? loginUrl.substring(0, lastSlash) : loginUrl);
      if (debugOut_)
      {
        System.out.println("\n doLogin \n");
        System.out.println("loginUrl_:" + loginUrl);
        System.out.println("requestProps_:" + requestProps);
        System.out.println("loginFormID_:" + formID);
        System.out.println("usernameFormElementName_:" + usernameFormElementName);
        System.out.println("passwordFormElementName_:" + passwordFormElementName);
        System.out.println("username_,password_:" + username + " , " + password);
      }
      String page = getPageContent(loginUrl);
      if (debugOut_) System.out.println("-------------------\nLoginPage:\n" + page + "\n-----------------\n");

      String postParams = getLoginFormParams( page,
                                              formID,
                                              usernameFormElementName, passwordFormElementName,
                                              username, password);
      retVal = (sendPost(loginUrl, postParams, requestProps) == 200);
    }
    catch (Exception ex)
    {
      // best effort: any failure simply means "not logged in"
      if (debugOut_) System.out.println("doLogin failed: " + ex);
      retVal = false;
    }
    alreadyLoggedIn_ = retVal;
    return retVal;
  }


  /**
   * Scrape and return string between the scrapeStart and scrapeEnd from the url.
   * The returned text begins with scrapeStart itself and stops just before scrapeEnd.
   * An empty string is returned if either marker is missing or the page cannot be loaded.
   * This variant neither reads nor updates the page cache.
   **/
  public String doScrape( String scrapePageUrl,
                          String scrapeStart,
                          String scrapeEnd)
  {
    String retVal = "";
    try
    {
      String result = getPageContent(scrapePageUrl);
      retVal = result.substring( result.indexOf(scrapeStart),
                                 result.indexOf(scrapeEnd));
    }
    catch (Exception ex)
    {
      // missing markers (index -1) or a failed page load both end up here
      retVal = "";
    }
    return retVal;
  }


  /**
   * Scrape and return string between the scrapeStart and scrapeEnd from the url.
   * A missing start marker means "from the beginning of the page"; a missing end marker
   * means "to the end of the page". The full (unchopped) page is kept in the page cache.
   *
   * @param scrapePageUrl the url to load
   * @param scrapeStart the marker the returned text starts with
   * @param scrapeEnd the marker the returned text stops in front of
   * @param reqProps extra request headers (may be null)
   * @param useCache true to chop the cached page (if there is one) instead of reloading the url
   * @return the chopped page text; an empty string on any error
   **/
  public String doScrape( String scrapePageUrl,
                          String scrapeStart,
                          String scrapeEnd,
                          HashMap<String, String> reqProps,
                          boolean useCache)
  {
    String retVal = "";
    try
    {
      String result = pageContentCache_;
      if (!useCache || "".equals(pageContentCache_))
      {
        if (debugOut_) System.out.println(" doScrape : "+scrapePageUrl);
        result = getPageContent(scrapePageUrl, reqProps);
      }
      pageContentCache_ = result;
      int s = result.indexOf(scrapeStart);
      int e = result.indexOf(scrapeEnd);
      if (s == -1) s = 0;
      if (e == -1) e = result.length();
      if (debugOut_) System.out.println(" Scrape Start="+scrapeStart + " index="+s);
      if (debugOut_) System.out.println(" Scrape End="+scrapeEnd + " index="+e);
      retVal = result.substring(s, e);
    }
    catch (Exception ex)
    {
      retVal = "";
    }
    return retVal;
  }


  /** Scrape (not using cache) and return response string from the url. **/
  public String doScrape( String scrapePageUrl,
                          String scrapeStart,
                          String scrapeEnd,
                          HashMap<String, String> reqProps)
  {
    return doScrape(scrapePageUrl, scrapeStart, scrapeEnd, reqProps, false);
  }


  /** Scrape and return response string from the url using the class fields for scrape start and end. **/
  public String doScrape(String scrapePageUrl, HashMap<String, String> reqProps)
  {
    return doScrape(scrapePageUrl, getScrapeStart(), getScrapeEnd(), reqProps, false);
  }


  /** scrapes and returns the data string using the default class fields for scrape start and end and requestprops. **/
  public String doScrape(String scrapePageUrl)
  {
    return doScrape(scrapePageUrl, getScrapeStart(), getScrapeEnd(), getRequestProps(), false);
  }


  /** Scrape (optionally using the cache) and return response string from the class field url. **/
  public String doScrape(HashMap<String, String> reqProps, boolean useCache)
  {
    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), reqProps, useCache);
  }


  /** Scrape and return response string from the class field url. **/
  public String doScrape(HashMap<String, String> reqProps)
  {
    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), reqProps, false);
  }


  /** Scrape (optionally using the cache) and return response string from the class field url. **/
  public String doScrape(boolean useCache)
  {
    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), getRequestProps(), useCache);
  }


  /** Scrape and return response string from the class field url. **/
  public String doScrape()
  {
    return doScrape(getScrapePageUrl(), getScrapeStart(), getScrapeEnd(), getRequestProps(), false);
  }


  /**
   * Makes the JSON string pretty with indenting.
   * This is a plain sequence of text replacements (it does not parse the JSON), so
   * brackets, braces and commas inside string values get reformatted as well.
   * The order of the replacements matters: later ones tidy up the output of earlier ones.
   **/
  public static String prettyJson(String jsonStr)
  {
    String retVal = jsonStr;
    retVal = retVal.replace("[", " [\n");
    retVal = retVal.replace("]", " ]" + SYSTEM_LINE_SEPERATOR);
    retVal = retVal.replace("]\n\"", " ]\"" + SYSTEM_LINE_SEPERATOR);
    retVal = retVal.replace("{", " {" + SYSTEM_LINE_SEPERATOR + " ");
    retVal = retVal.replace("}", "}" + SYSTEM_LINE_SEPERATOR);
    retVal = retVal.replace(",", "," + SYSTEM_LINE_SEPERATOR + " ");
    retVal = retVal.replace("}" + SYSTEM_LINE_SEPERATOR + "," + SYSTEM_LINE_SEPERATOR, " }," + SYSTEM_LINE_SEPERATOR);
    retVal = retVal.replace("[\n", " [ ");
    retVal = retVal.replace(" {", " {");
    retVal = retVal.replace("[ {", "[\n {");
    retVal = retVal.replace("\n}", "\n }");
    return retVal;
  }


  /** Parses the passed JSON text into a JsonObject. **/
  public JsonObject toJsonObject(String jsonStr)
  {
    JsonReader jsonReader = Json.createReader(new StringReader(jsonStr));
    try
    {
      return jsonReader.readObject();
    }
    finally
    {
      jsonReader.close(); // also release the reader when parsing fails
    }
  }


  /** Parses the passed JSON text into a JsonStructure (an object or an array). **/
  public JsonStructure toJsonStructure(String jsonStr)
  {
    JsonReader jsonReader = Json.createReader(new StringReader(jsonStr));
    try
    {
      return jsonReader.read();
    }
    finally
    {
      jsonReader.close(); // also release the reader when parsing fails
    }
  }


  /** Test method to do whatever tests I want; uses the class fields as the parameters. **/
  protected void test(String[] args)
  {
    test(args, getLoginUrl(), getRequestProps(),
         getLoginFormID(), getUsernameFormElementName(),
         getPasswordFormElementName(), getUsername(), getPassword());
  }


  /**
   * Test method: reads the file scrapedPageContent-&lt;date&gt;.json from the current directory,
   * parses it as JSON and prints a summary of the first entry of its "Data" array.
   * Only the file is used; the login related parameters are currently ignored.
   **/
  protected void test(String[] args,
                      String loginUrl,
                      HashMap<String, String> requestProps,
                      String formID,
                      String usernameFormElementName,
                      String passwordFormElementName,
                      String username,
                      String password)
  {
    String result = readFileToString("scrapedPageContent-" + dateStr_ + ".json");
    JsonStructure jsS = toJsonStructure(result);
    System.out.println("\n\n---------------------------\nFull Data Structure: \n---------------------------\n" + jsS.toString());
    System.out.println("ValueType=" + jsS.getValueType());
    JsonObject jsO = (JsonObject)jsS;
    JsonArray jsA = jsO.getJsonArray("Data");
    JsonObject dataHome = jsA.getJsonObject(0);
    String friendlyName = dataHome.getString("FriendlyName");
    System.out.println(" FriendlyName : " + friendlyName);
    System.out.println(" MarketValue : " + dataHome.getJsonNumber("MarketValue"));
    System.out.println(" TradeCash : " + dataHome.getJsonNumber("TradeCash"));
    System.out.println(" BookValue : " + dataHome.getJsonNumber("BookValue"));
    System.out.println(" ------------------ ------------------------");
    System.out.println(" UnrealizedGainLoss : " + dataHome.getJsonNumber("UnrealizedGainLoss"));
    JsonArray holdings = dataHome.getJsonArray("Holdings");

    // System.out.println("holdings ValueType="+holdings.getValueType());
    System.out.println("\n\n---------------------------\nHoldings:\n---------------------------\n" + prettyJson(holdings.toString()));
  }


  /**
   * Command line entry: with -t as the first argument the test method is run; otherwise it
   * logs in to the hard coded ownCloud url and, on success, saves and prints the files page.
   **/
  public static void main(String[] args)
  {
    String loginUrl = "https://red.webarts.bc.ca/owncloud/";
    String scrapePageUrl = "https://red.webarts.bc.ca/owncloud/index.php/apps/files/";
    String ocLoginFormElement = "login"; /* id of the form element */
    String ocUserLoginElement = "user"; /* the element name in the form to look for */
    String ocPasswordElement = "password"; /* the element name in the form to look for */
    String ocUserLogin = "tgutwin";
    String ocPassword = "";

    // String baseUrl="/";
    HashMap<String, String> reqProps = new HashMap<String, String>();
    reqProps.put("Accept", "text/html,application/xhtml+xml,application/xml");
    reqProps.put("Accept-Encoding ", "gzip, deflate, br");
    reqProps.put("Accept-Language ", "en-US,en;q=0.5");
    reqProps.put("Connection", "keep-alive");
    reqProps.put("Content-Type", "application/x-www-form-urlencoded");
    reqProps.put("Upgrade-Insecure-Requests", "1");
    reqProps.put("Host", "red.webarts.bc.ca");
    UrlScraper instance = new UrlScraper();
    if (args.length > 0 && args[0].toLowerCase().equals("-t"))
    {
      instance.test(args, loginUrl, reqProps, ocLoginFormElement, ocUserLoginElement, ocPasswordElement, ocUserLogin, ocPassword);
    }
    else
    {
      boolean success = instance.doLogin(loginUrl, reqProps, ocLoginFormElement, ocUserLoginElement, ocPasswordElement, ocUserLogin, ocPassword);

      // on success go to the page you want.
      if (success)
      {
        String result = instance.doScrape(scrapePageUrl);
        writeStringToFile(result, "ocPageContent-" + instance.dateStr_ + ".html");
        System.out.println(result);
      }
    }
  }


  /**
   * Copies the passed request properties onto the current connection_.
   * Entries with a null or blank key or value are skipped, as are "User-Agent",
   * "Content-Length" and "Cookie" (those are set by this class itself).
   * Keys and values are trimmed before they are set.
   *
   * @param reqProps the properties to copy (may be null, then nothing happens)
   **/
  private void applyRequestProps(HashMap<String, String> reqProps)
  {
    if (reqProps == null)
    {
      return;
    }
    for (String key : reqProps.keySet())
    {
      String value = reqProps.get(key);
      if (key == null || value == null)
      {
        continue;
      }
      String name = key.trim();
      String val = value.trim();
      boolean reserved = "User-Agent".equals(name) || "Content-Length".equals(name) || "Cookie".equals(name);
      if (!reserved && !"".equals(name) && !"".equals(val))
      {
        connection_.setRequestProperty(name, val);
        if (debugOut_) System.out.println(" setRequestProperty " + name + " = " + val);
      }
    }
  }


  /**
   * Reads the passed reader to its end and always closes it.
   * The lines are joined WITHOUT line terminators.
   *
   * @param in the reader to drain and close
   * @return all the lines read, concatenated
   **/
  private static String readAllLines(BufferedReader in) throws IOException
  {
    try
    {
      StringBuilder text = new StringBuilder();
      String inputLine;
      while ((inputLine = in.readLine()) != null)
      {
        text.append(inputLine);
      }
      return text.toString();
    }
    finally
    {
      in.close();
    }
  }


  /**
   * Sends a POST request to the url, along with all the passed post parameters and request properties.
   * The response text is cached in postPageResponse_ and also written to the file
   * postPageResponse.html in the current directory.
   *
   * @param url is the url to post this stuff to
   * @param postParams is a sequential string of params that get sent in this post
   * @param reqProps extra request headers to send (may be null)
   * @return the HTTP response code
   **/
  protected int sendPost(String url, String postParams, HashMap<String, String> reqProps) throws MalformedURLException, IOException, ProtocolException
  {
    if (debugOut_) System.out.println("\n sendPost to url=" + url);
    URL obj = new URL(url);
    connection_ = (HttpsURLConnection)obj.openConnection();

    // Act like a browser
    connection_.setUseCaches(false);
    connection_.setRequestMethod("POST");
    connection_.setRequestProperty("User-Agent", USER_AGENT);
    applyRequestProps(reqProps);

    // the length header must count the bytes that really get written
    byte[] body = postParams.getBytes("UTF-8");
    connection_.setRequestProperty("Content-Length", Integer.toString(body.length));
    connection_.setDoOutput(true);
    connection_.setDoInput(true);

    // connection_.setInstanceFollowRedirects(false);
    cm_.setCookies(connection_);
    connection_.connect();

    // Send post request
    DataOutputStream wr = new DataOutputStream(connection_.getOutputStream());
    try
    {
      wr.write(body);
      wr.flush();
    }
    finally
    {
      wr.close();
    }
    int responseCode = connection_.getResponseCode();
    cm_.storeCookies((java.net.URLConnection)connection_);
    System.out.println("\nSent 'POST' request to URL : " + url);
    // the parameters hold the login password, so they only show up in debug mode
    if (debugOut_) System.out.println("Post parameters : " + postParams);
    System.out.println("Response Code : " + responseCode);

    // Read the response
    postPageResponse_ = readAllLines(new BufferedReader(new InputStreamReader(connection_.getInputStream())));
    if (debugOut_) System.out.println("\n\n -----------------\nPostPage Response: ");
    if (debugOut_) System.out.println(postPageResponse_);

    writeStringToFile(postPageResponse_, "postPageResponse.html");
    return responseCode;
  }


  /**
   * Connects/retrieves a URL; pulls its cookies and returns the resulting html as a string.
   **/
  protected String getPageContent(String url) throws MalformedURLException, IOException, ProtocolException
  {
    return getPageContent(url, null);
  }


  /**
   * Connects/retrieves a URL; adds its cookies and returns the resulting html as a string.
   * The Set-Cookie headers of the response are saved with setCookies for the next request.
   *
   * @param url is the url to get
   * @param reqProps optional (can be null) map of properties to add as requestProperties
   * @return the response text with its lines joined without line terminators
   **/
  protected String getPageContent(String url, HashMap<String, String> reqProps) throws MalformedURLException, IOException, ProtocolException
  {
    if (debugOut_)
    {
      System.out.println("\n GetPageContent from url= " + url);
    }
    URL obj = new URL(url);
    connection_ = (HttpsURLConnection)obj.openConnection();

    // default is GET
    connection_.setRequestMethod("GET");
    connection_.setUseCaches(false);

    // act like a browser
    connection_.setRequestProperty("User-Agent", USER_AGENT);
    connection_.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    connection_.setRequestProperty("Accept-Language", "en-US,en;q=0.5");
    applyRequestProps(reqProps);
    if (cookies_ != null)
    {
      for (String cookie : this.cookies_)
      {
        if (cookie == null)
        {
          continue;
        }
        // only the leading name=value pair goes back to the server, not the cookie attributes
        String nameValue = cookie.split(";", 2)[0];
        if (debugOut_) System.out.println("adding cookies to page request: " + cookie + " (" + nameValue + ")");
        connection_.addRequestProperty("Cookie", nameValue);
      }
    }

    cm_.setCookies(connection_);
    connection_.connect();
    System.out.println("\nSending 'GET' request to URL : " + url);
    String response = readAllLines(new BufferedReader(new InputStreamReader(connection_.getInputStream())));
    int responseCode = connection_.getResponseCode();
    System.out.println("Response Code : " + responseCode);
    cm_.storeCookies((java.net.URLConnection)connection_);

    // Get the response cookies
    setCookies(connection_.getHeaderFields().get("Set-Cookie"));
    return response;
  }


  /**
   * Concatenates together the URL parameter string (ie. name=value&amp;name2=val2&amp;someOtherParamName=val3 )
   * for a specific login form in the passed URL/html string. It also substitutes the username and password
   * for the appropriate form elements. <br> This parameterString ends up getting sent to the POST request for that form.
   * <br>The form is looked up by id first, then by its name attribute and finally by its class attribute.
   * Form elements named "timezone" and "timezone-offset" get fixed values.
   *
   * @param html is the string representation of the URL that has the form to parse
   * @param formID the form id (or name, or class) to parse parameters from
   * @param usernameFormElementName the form elementName used for the username field
   * @param passwordFormElementName the form elementName used for the password field
   * @param username the actual login username to use in the form
   * @param password the actual login password to use in the form
   * @return the parameter string with the values URL encoded as UTF-8
   * @throws IllegalArgumentException if no form matching formID is found in the html
   **/
  public String getLoginFormParams(String html,
                                   String formID,
                                   String usernameFormElementName,
                                   String passwordFormElementName,
                                   String username,
                                   String password) throws UnsupportedEncodingException
  {
    System.out.println("Extracting form's data...");
    if (debugOut_) System.out.println("\n ---------------------------------------- \n" + html + "\n ---------------------------------------- \n");

    Document doc = Jsoup.parse(html);
    if (debugOut_) System.out.println("\n doc is parsed?=" + (doc != null));
    if (debugOut_) System.out.println(" Looking for form id=" + formID);

    // try the id first and fall back to the name and then the class attribute
    Element loginform = doc.getElementById(formID);
    if (loginform == null)
    {
      loginform = doc.getElementsByAttributeValue("name", formID).first();
    }
    if (loginform == null)
    {
      loginform = doc.getElementsByAttributeValue("class", formID).first();
    }
    if (loginform == null)
    {
      throw new IllegalArgumentException("Login form not found in page: " + formID);
    }

    Elements inputElements = loginform.getElementsByTag("input");
    List<String> paramList = new ArrayList<String>();
    for (Element inputElement : inputElements)
    {
      String key = inputElement.attr("name");
      String value = inputElement.attr("value");
      if (debugOut_)
      {
        System.out.println(" form element " + key + " = " + value);
      }
      if (key.equals(usernameFormElementName))
      {
        value = username;
      }
      else if (key.equals(passwordFormElementName))
      {
        value = password;
      }
      else if (key.equals("timezone"))
      {
        value = "America/Los_Angeles";
      }
      else if (key.equals("timezone-offset"))
      {
        value = "-7";
      }
      paramList.add(key + "=" + URLEncoder.encode(value, "UTF-8"));
    }

    // build parameters list
    StringBuilder result = new StringBuilder();
    for (String param : paramList)
    {
      if (result.length() > 0)
      {
        result.append("&");
      }
      result.append(param);
    }
    return result.toString();
  }


  /**
   * Abstracts the writing of string to a file.
   *
   * @param s is the String to writeout
   * @param fileName is the file name of the file to write the String into
   * @return if success.. the passed filename is returned else null
   **/
  public static String writeStringToFile(String s, String fileName)
  {
    return writeStringToFile(s, fileName, false);
  }


  /**
   * Abstracts the writing of string to a (zip) file (Zip NOT IMPLEMENTED YET).
   * The string is written with the platform default character encoding.
   *
   * @param s is the String to writeout
   * @param fileName is the file name of the file to write the String into
   * @param zipCompress boolean flag to compress with zip compression (currently ignored)
   * @return if success.. the passed filename is returned else null
   **/
  public static String writeStringToFile(String s, String fileName, boolean zipCompress)
  {
    String retVal = fileName;
    try
    {
      FileOutputStream fos = new FileOutputStream(fileName);
      try
      {
        fos.write(s.getBytes());
        fos.flush();
      }
      finally
      {
        fos.close(); // release the file handle even when the write fails
      }
    }
    catch (IOException ioEx)
    {
      System.out.println("\nERROR Writing file: " + fileName);
      retVal = null;
    }
    return retVal;
  }


  /**
   * Abstracts the reading of a file and returns the contents as a String.
   * Every line read is terminated with the system line separator.
   * On an error a message is printed and whatever was read so far is returned.
   *
   * @param fileName is the file name to read into a String
   * @return the Text file contents as a String (empty if the file cannot be found)
   **/
  public static String readFileToString(String fileName)
  {
    StringBuffer stringOut = new StringBuffer();
    try
    {
      BufferedReader in = new BufferedReader(new FileReader(fileName));
      try
      {
        String stringLine;
        while ((stringLine = in.readLine()) != null)
        {
          stringOut.append(stringLine);
          stringOut.append(SYSTEM_LINE_SEPERATOR);
        }
      }
      finally
      {
        in.close(); // the reader was never closed before
      }
    }
    catch (FileNotFoundException fnfEx)
    {
      System.out.println("Cannot find file: " + fileName);
    }
    catch (IOException ioEx)
    {
      System.out.println("Error Reading File to String: " + fileName);
    }
    return stringOut.toString();
  }
}