001/* 002 * $Log: Diff.java,v $ 003 * Revision 1.7 2009/01/19 03:05:26 stuart 004 * Fix StackOverflow bug with heuristic on reported by Jimmy Han. 005 * 006 * Revision 1.6 2003/03/06 22:51:32 stuart 007 * Convert to CVS 008 * 009 * Revision 1.5 2002/07/19 19:14:40 stuart 010 * fix reverseScript, make change ctor public, update docs 011 * 012 * Revision 1.4 2002/04/09 17:53:39 stuart 013 * More flexible interface for diff() function. 014 * 015 * Revision 1.3 2000/03/03 21:58:03 stuart 016 * move discard_confusing_lines and shift_boundaries to class file_data 017 * 018 * Revision 1.2 2000/03/02 16:37:38 stuart 019 * Add GPL and copyright 020 * 021*/ 022// package bmsi.util; 023package ca.bc.webarts.tools; 024 025import java.util.Hashtable; 026 027/** A class to compare vectors of objects. The result of comparison 028 * is a list of <code>change</code> objects which form an 029 * edit script. The objects compared are traditionally lines 030 * of text from two files. Comparison options such as "ignore 031 * whitespace" are implemented by modifying the <code>equals</code> 032 * and <code>hashcode</code> methods for the objects compared. 033 * <p> 034 * The basic algorithm is described in: </br> 035 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 036 * Algorithmica Vol. 1 No. 2, 1986, p 251. 037 * <p> 038 * This class outputs different results from GNU diff 1.15 on some 039 * inputs. Our results are actually better (smaller change list, smaller 040 * total size of changes), but it would be nice to know why. Perhaps 041 * there is a memory overwrite bug in GNU diff 1.15. 042 * 043 * @author Stuart D. Gathman, translated from GNU diff 1.15 044 * Copyright (C) 2000 Business Management Systems, Inc. 045 * <p> 046 * This program is free software; you can redistribute it and/or modify 047 * it under the terms of the GNU General Public License as published by 048 * the Free Software Foundation; either version 1, or (at your option) 049 * any later version. 050 * <p> 051 * This program is distributed in the hope that it will be useful, 052 * but WITHOUT ANY WARRANTY; without even the implied warranty of 053 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 054 * GNU General Public License for more details. 055 * <p> 056 * You should have received a copy of the <a href=COPYING.txt> 057 * GNU General Public License</a> 058 * along with this program; if not, write to the Free Software 059 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 060 * 061 */ 062 063public class Diff 064{ 065 066 /** Prepare to find differences between two arrays. Each element of 067 * the arrays is translated to an "equivalence number" based on 068 * the result of <code>equals</code>. The original Object arrays 069 * are no longer needed for computing the differences. They will 070 * be needed again later to print the results of the comparison as 071 * an edit script, if desired. 072 */ 073 public Diff(Object[] a, Object[] b) 074 { 075 Hashtable h = new Hashtable(a.length + b.length); 076 filevec[0] = new file_data(a, h); 077 filevec[1] = new file_data(b, h); 078 } 079 080 /** 1 more than the maximum equivalence value used for this or its 081 * sibling file. */ 082 private int equiv_max = 1; 083 084 /** When set to true, the comparison uses a heuristic to speed it up. 085 * With this heuristic, for files with a constant small density 086 * of changes, the algorithm is linear in the file size. */ 087 public boolean heuristic = false; 088 089 /** When set to true, the algorithm returns a guarranteed minimal 090 * set of changes. This makes things slower, sometimes much slower. */ 091 public boolean no_discards = false; 092 093 private int[] xvec, yvec; /* Vectors being compared. */ 094 private int[] fdiag; /* Vector, indexed by diagonal, containing 095 the X coordinate of the point furthest 096 along the given diagonal in the forward 097 search of the edit matrix. */ 098 private int[] bdiag; /* Vector, indexed by diagonal, containing 099 the X coordinate of the point furthest 100 along the given diagonal in the backward 101 search of the edit matrix. */ 102 private int fdiagoff, bdiagoff; 103 private final file_data[] filevec = new file_data[2]; 104 private int cost; 105 /** Snakes bigger than this are considered "big". */ 106 private static final int SNAKE_LIMIT = 20; 107 108 /** Find the midpoint of the shortest edit script for a specified 109 * portion of the two files. 110 * 111 * We scan from the beginnings of the files, and simultaneously from the ends, 112 * doing a breadth-first search through the space of edit-sequence. 113 * When the two searches meet, we have found the midpoint of the shortest 114 * edit sequence. 115 * 116 * The value returned is the number of the diagonal on which the midpoint lies. 117 * The diagonal number equals the number of inserted lines minus the number 118 * of deleted lines (counting only lines before the midpoint). 119 * The edit cost is stored into COST; this is the total number of 120 * lines inserted or deleted (counting only lines before the midpoint). 121 * 122 * This function assumes that the first lines of the specified portions 123 * of the two files do not match, and likewise that the last lines do not 124 * match. The caller must trim matching lines from the beginning and end 125 * of the portions it is going to specify. 126 * 127 * Note that if we return the "wrong" diagonal value, or if 128 * the value of bdiag at that diagonal is "wrong", 129 * the worst this can do is cause suboptimal diff output. 130 * It cannot cause incorrect diff output. */ 131 132 private int diag(int xoff, int xlim, int yoff, int ylim) 133 { 134 final int[] fd = fdiag; // Give the compiler a chance. 135 final int[] bd = bdiag; // Additional help for the compiler. 136 final int[] xv = xvec; // Still more help for the compiler. 137 final int[] yv = yvec; // And more and more . . . 138 final int dmin = xoff - ylim; // Minimum valid diagonal. 139 final int dmax = xlim - yoff; // Maximum valid diagonal. 140 final int fmid = xoff - yoff; // Center diagonal of top-down search. 141 final int bmid = xlim - ylim; // Center diagonal of bottom-up search. 142 int fmin = fmid, fmax = fmid; // Limits of top-down search. 143 int bmin = bmid, bmax = bmid; // Limits of bottom-up search. 144 /* True if southeast corner is on an odd 145 diagonal with respect to the northwest. */ 146 final boolean odd = (fmid - bmid & 1) != 0; 147 148 fd[ fdiagoff + fmid] = xoff; 149 bd[ bdiagoff + bmid] = xlim; 150 151 for (int c = 1;; ++c) 152 { 153 int d; /* Active diagonal. */ 154 boolean big_snake = false; 155 156 /* Extend the top-down search by an edit step in each diagonal. */ 157 if (fmin > dmin) 158 { 159 fd[ fdiagoff + -- fmin - 1] = -1; 160 } 161 else 162 { 163 ++fmin; 164 } 165 if (fmax < dmax) 166 { 167 fd[ fdiagoff + ++ fmax + 1] = -1; 168 } 169 else 170 { 171 --fmax; 172 } 173 for (d = fmax; d >= fmin; d -= 2) 174 { 175 int x, y, oldx, tlo = fd[ fdiagoff + d - 1], thi = fd[ fdiagoff + d + 1]; 176 177 if (tlo >= thi) 178 { 179 x = tlo + 1; 180 } 181 else 182 { 183 x = thi; 184 } 185 oldx = x; 186 y = x - d; 187 while (x < xlim && y < ylim && xv[x] == yv[y]) 188 { 189 ++x; ++y; 190 } 191 if (x - oldx > SNAKE_LIMIT) 192 { 193 big_snake = true; 194 } 195 fd[ fdiagoff + d] = x; 196 if (odd && bmin <= d && d <= bmax && bd[ bdiagoff + d] <= fd[ fdiagoff + d]) 197 { 198 cost = 2 * c - 1; 199 return d; 200 } 201 } 202 203 /* Similar extend the bottom-up search. */ 204 if (bmin > dmin) 205 { 206 bd[ bdiagoff + -- bmin - 1] = Integer.MAX_VALUE; 207 } 208 else 209 { 210 ++bmin; 211 } 212 if (bmax < dmax) 213 { 214 bd[ bdiagoff + ++ bmax + 1] = Integer.MAX_VALUE; 215 } 216 else 217 { 218 --bmax; 219 } 220 for (d = bmax; d >= bmin; d -= 2) 221 { 222 int x, y, oldx, tlo = bd[ bdiagoff + d - 1], thi = bd[ bdiagoff + d + 1]; 223 224 if (tlo < thi) 225 { 226 x = tlo; 227 } 228 else 229 { 230 x = thi - 1; 231 } 232 oldx = x; 233 y = x - d; 234 while (x > xoff && y > yoff && xv[ x - 1] == yv[ y - 1]) 235 { 236 --x; --y; 237 } 238 if (oldx - x > SNAKE_LIMIT) 239 { 240 big_snake = true; 241 } 242 bd[ bdiagoff + d] = x; 243 if (! odd && fmin <= d && d <= fmax && bd[ bdiagoff + d] <= fd[ fdiagoff + d]) 244 { 245 cost = 2 * c; 246 return d; 247 } 248 } 249 250 /* Heuristic: check occasionally for a diagonal that has made 251 lots of progress compared with the edit distance. 252 If we have any such, find the one that has made the most 253 progress and return it as if it had succeeded. 254 255 With this heuristic, for files with a constant small density 256 of changes, the algorithm is linear in the file size. */ 257 258 if (c > 200 && big_snake && heuristic) 259 { 260 int best = 0; 261 int bestpos = -1; 262 263 for (d = fmax; d >= fmin; d -= 2) 264 { 265 int dd = d - fmid; 266 int x = fd[ fdiagoff + d]; 267 int y = x - d; 268 int v = (x - xoff) * 2 - dd; 269 if (v > 12 * (c + (dd < 0 ? - dd : dd))) 270 { 271 if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) 272 { 273 /* We have a good enough best diagonal; 274 now insist that it end with a significant snake. */ 275 int k; 276 277 for (k = 1; xvec[ x - k] == yvec[ y - k]; k++) 278 { 279 if (k == SNAKE_LIMIT) 280 { 281 best = v; 282 bestpos = d; 283 break; 284 } 285 } 286 } 287 } 288 } 289 if (best > 0) 290 { 291 cost = 2 * c - 1; 292 return bestpos; 293 } 294 295 best = 0; 296 for (d = bmax; d >= bmin; d -= 2) 297 { 298 int dd = d - bmid; 299 int x = bd[ bdiagoff + d]; 300 int y = x - d; 301 int v = (xlim - x) * 2 + dd; 302 if (v > 12 * (c + (dd < 0 ? - dd : dd))) 303 { 304 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) 305 { 306 /* We have a good enough best diagonal; 307 now insist that it end with a significant snake. */ 308 int k; 309 310 for (k = 0; xvec[ x + k] == yvec[ y + k]; k++) 311 { 312 if (k == SNAKE_LIMIT) 313 { 314 best = v; 315 bestpos = d; 316 break; 317 } 318 } 319 } 320 } 321 } 322 if (best > 0) 323 { 324 cost = 2 * c - 1; 325 return bestpos; 326 } 327 } 328 } 329 } 330 331 /** Compare in detail contiguous subsequences of the two files 332 * which are known, as a whole, to match each other. 333 * 334 * The results are recorded in the vectors filevec[N].changed_flag, by 335 * storing a 1 in the element for each line that is an insertion or deletion. 336 * 337 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 338 * 339 * Note that XLIM, YLIM are exclusive bounds. 340 * All line numbers are origin-0 and discarded lines are not counted. */ 341 342 private void compareseq(int xoff, int xlim, int yoff, int ylim) 343 { 344 /* Slide down the bottom initial diagonal. */ 345 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) 346 { 347 ++xoff; ++yoff; 348 } 349 /* Slide up the top initial diagonal. */ 350 while (xlim > xoff && ylim > yoff && xvec[ xlim - 1] == yvec[ ylim - 1]) 351 { 352 --xlim; --ylim; 353 } 354 355 /* Handle simple cases. */ 356 if (xoff == xlim) 357 { 358 while (yoff < ylim) 359 { 360 filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true; 361 } 362 } 363 else if (yoff == ylim) 364 { 365 while (xoff < xlim) 366 { 367 filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true; 368 } 369 } 370 else 371 { 372 /* Find a point of correspondence in the middle of the files. */ 373 374 int d = diag(xoff, xlim, yoff, ylim); 375 int c = cost; 376 int f = fdiag[ fdiagoff + d]; 377 int b = bdiag[ bdiagoff + d]; 378 379 if (c == 1) 380 { 381 /* This should be impossible, because it implies that 382 one of the two subsequences is empty, 383 and that case was handled above without calling `diag'. 384 Let's verify that this is true. */ 385 throw new IllegalArgumentException("Empty subsequence"); 386 } 387 else 388 { 389 /* Use that point to split this problem into two subproblems. */ 390 compareseq(xoff, b, yoff, b - d); 391 /* This used to use f instead of b, 392 but that is incorrect! 393 It is not necessarily the case that diagonal d 394 has a snake from b to f. */ 395 compareseq(b, xlim, b - d, ylim); 396 } 397 } 398 } 399 400 /** Discard lines from one file that have no matches in the other file. 401 */ 402 403 private void discard_confusing_lines() 404 { 405 filevec[0].discard_confusing_lines(filevec[1]); 406 filevec[1].discard_confusing_lines(filevec[0]); 407 } 408 409 private boolean inhibit = false; 410 411 /** Adjust inserts/deletes of blank lines to join changes 412 * as much as possible. 413 */ 414 415 private void shift_boundaries() 416 { 417 if (inhibit) 418 { 419 return; 420 } 421 filevec[0].shift_boundaries(filevec[1]); 422 filevec[1].shift_boundaries(filevec[0]); 423 } 424 425 public interface ScriptBuilder 426 { 427 /** Scan the tables of which lines are inserted and deleted, 428 * producing an edit script. 429 * @param changed0 true for lines in first file which do not match 2nd 430 * @param len0 number of lines in first file 431 * @param changed1 true for lines in 2nd file which do not match 1st 432 * @param len1 number of lines in 2nd file 433 * @return a linked list of changes - or null 434 */ 435 public change build_script( boolean[] changed0, int len0, boolean[] changed1, int len1 ); 436 } 437 438 /** Scan the tables of which lines are inserted and deleted, 439 * producing an edit script in reverse order. */ 440 441 static class ReverseScript implements ScriptBuilder 442 { 443 public change build_script( final boolean[] changed0, int len0, final boolean[] changed1, int len1) 444 { 445 change script = null; 446 int i0 = 0, i1 = 0; 447 while (i0 < len0 || i1 < len1) 448 { 449 if (changed0[1 + i0] || changed1[1 + i1]) 450 { 451 int line0 = i0, line1 = i1; 452 453 /* Find # lines changed here in each file. */ 454 while (changed0[1 + i0]) 455 { 456 ++i0; 457 } 458 459 while (changed1[1 + i1]) 460 { 461 ++i1; 462 } 463 464 465 /* Record this change. */ 466 script = new change(line0, line1, i0 - line0, i1 - line1, script); 467 } 468 469 /* We have reached lines in the two files that match each other. */ 470 i0++; i1++; 471 } 472 473 return script; 474 } 475 } 476 477 static class ForwardScript implements ScriptBuilder 478 { 479 /** Scan the tables of which lines are inserted and deleted, 480 * producing an edit script in forward order. */ 481 public change build_script( final boolean[] changed0, int len0, final boolean[] changed1, int len1) 482 { 483 change script = null; 484 int i0 = len0, i1 = len1; 485 486 while (i0 >= 0 || i1 >= 0) 487 { 488 if (changed0[i0] || changed1[i1]) 489 { 490 int line0 = i0, line1 = i1; 491 492 /* Find # lines changed here in each file. */ 493 while (changed0[i0]) 494 { 495 --i0; 496 } 497 498 while (changed1[i1]) 499 { 500 --i1; 501 } 502 503 504 /* Record this change. */ 505 script = new change(i0, i1, line0 - i0, line1 - i1, script); 506 } 507 508 /* We have reached lines in the two files that match each other. */ 509 i0--; i1--; 510 } 511 512 return script; 513 } 514 } 515 516 /** Standard ScriptBuilders. */ 517 518 public static final ScriptBuilder forwardScript = new ForwardScript(), reverseScript = new ReverseScript(); 519 520 /* Report the differences of two files. DEPTH is the current directory 521 depth. */ 522 public final change diff_2(final boolean reverse) 523 { 524 return diff(reverse ? reverseScript : forwardScript); 525 } 526 527 /** Get the results of comparison as an edit script. The script 528 * is described by a list of changes. The standard ScriptBuilder 529 * implementations provide for forward and reverse edit scripts. 530 * Alternate implementations could, for instance, list common elements 531 * instead of differences. 532 * @param bld an object to build the script from change flags 533 * @return the head of a list of changes 534 */ 535 public change diff(final ScriptBuilder bld) 536 { 537 538 /* Some lines are obviously insertions or deletions 539 because they don't match anything. Detect them now, 540 and avoid even thinking about them in the main comparison algorithm. */ 541 542 discard_confusing_lines(); 543 544 /* Now do the main comparison algorithm, considering just the 545 undiscarded lines. */ 546 547 xvec = filevec[0].undiscarded; 548 yvec = filevec[1].undiscarded; 549 550 int diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; 551 fdiag = new int[diags]; 552 fdiagoff = filevec[1].nondiscarded_lines + 1; 553 bdiag = new int[diags]; 554 bdiagoff = filevec[1].nondiscarded_lines + 1; 555 556 compareseq(0, filevec[0].nondiscarded_lines, 0, filevec[1].nondiscarded_lines); 557 fdiag = null; 558 bdiag = null; 559 560 /* Modify the results slightly to make them prettier 561 in cases where that can validly be done. */ 562 563 shift_boundaries(); 564 565 /* Get the results of comparison in the form of a chain 566 of `struct change's -- an edit script. */ 567 return bld.build_script (filevec[0].changed_flag, filevec[0].buffered_lines, filevec[1].changed_flag, filevec[1].buffered_lines ); 568 569 } 570 571 /** The result of comparison is an "edit script": a chain of change objects. 572 * Each change represents one place where some lines are deleted 573 * and some are inserted. 574 * 575 * LINE0 and LINE1 are the first affected lines in the two files (origin 0). 576 * DELETED is the number of lines deleted here from file 0. 577 * INSERTED is the number of lines inserted here in file 1. 578 * 579 * If DELETED is 0 then LINE0 is the number of the line before 580 * which the insertion was done; vice versa for INSERTED and LINE1. */ 581 582 public static class change 583 { 584 /** Previous or next edit command. */ 585 public change link; 586 /** # lines of file 1 changed here. */ 587 public final int inserted; 588 /** # lines of file 0 changed here. */ 589 public final int deleted; 590 /** Line number of 1st deleted line. */ 591 public final int line0; 592 /** Line number of 1st inserted line. */ 593 public final int line1; 594 595 /** Cons an additional entry onto the front of an edit script OLD. 596 * LINE0 and LINE1 are the first affected lines in the two files (origin 0). 597 * DELETED is the number of lines deleted here from file 0. 598 * INSERTED is the number of lines inserted here in file 1. 599 * 600 * If DELETED is 0 then LINE0 is the number of the line before 601 * which the insertion was done; vice versa for INSERTED and LINE1. */ 602 public change(int line0, int line1, int deleted, int inserted, change old) 603 { 604 this.line0 = line0; 605 this.line1 = line1; 606 this.inserted = inserted; 607 this.deleted = deleted; 608 this.link = old; 609 // System.err.println(line0+","+line1+","+inserted+","+deleted); 610 } 611 } 612 613 /** Data on one input file being compared. 614 */ 615 616 class file_data 617 { 618 619 /** Allocate changed array for the results of comparison. */ 620 void clear() 621 { 622 /* Allocate a flag for each line of each file, saying whether that line 623 is an insertion or deletion. 624 Allocate an extra element, always zero, at each end of each vector. 625 */ 626 changed_flag = new boolean[ buffered_lines + 2]; 627 } 628 629 /** Return equiv_count[I] as the number of lines in this file 630 * that fall in equivalence class I. 631 * @return the array of equivalence class counts. 632 */ 633 int[] equivCount() 634 { 635 int[] equiv_count = new int[equiv_max]; 636 for (int i = 0; i < buffered_lines; ++i) 637 { 638 ++equiv_count[equivs[i]]; 639 } 640 641 return equiv_count; 642 } 643 644 /** Discard lines that have no matches in another file. 645 * 646 * A line which is discarded will not be considered by the actual 647 * comparison algorithm; it will be as if that line were not in the file. 648 * The file's `realindexes' table maps virtual line numbers 649 * (which don't count the discarded lines) into real line numbers; 650 * this is how the actual comparison algorithm produces results 651 * that are comprehensible when the discarded lines are counted. 652 * <p> 653 * When we discard a line, we also mark it as a deletion or insertion 654 * so that it will be printed in the output. 655 * @param f the other file 656 */ 657 void discard_confusing_lines(file_data f) 658 { 659 clear(); 660 /* Set up table of which lines are going to be discarded. */ 661 final byte[] discarded = discardable(f.equivCount()); 662 663 /* Don't really discard the provisional lines except when they occur 664 in a run of discardables, with nonprovisionals at the beginning 665 and end. */ 666 filterDiscards(discarded); 667 668 /* Actually discard the lines. */ 669 discard(discarded); 670 } 671 672 /** Mark to be discarded each line that matches no line of another file. 673 * If a line matches many lines, mark it as provisionally discardable. 674 * @see equivCount() 675 * @param counts The count of each equivalence number for the other file. 676 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable 677 * for each line 678 */ 679 680 private byte[] discardable(final int[] counts) 681 { 682 final int end = buffered_lines; 683 final byte[] discards = new byte[end]; 684 final int[] equivs = this.equivs; 685 int many = 5; 686 int tem = end / 64; 687 688 /* Multiply MANY by approximate square root of number of lines. 689 That is the threshold for provisionally discardable lines. */ 690 while ((tem = tem >> 2) > 0) 691 { 692 many *= 2; 693 } 694 695 696 for (int i = 0; i < end; i++) 697 { 698 int nmatch; 699 if (equivs[i] == 0) 700 { 701 continue; 702 } 703 nmatch = counts[equivs[i]]; 704 if (nmatch == 0) 705 { 706 discards[i] = 1; 707 } 708 else if (nmatch > many) 709 { 710 discards[i] = 2; 711 } 712 } 713 return discards; 714 } 715 716 /** Don't really discard the provisional lines except when they occur 717 * in a run of discardables, with nonprovisionals at the beginning 718 * and end. */ 719 720 private void filterDiscards(final byte[] discards) 721 { 722 final int end = buffered_lines; 723 724 for (int i = 0; i < end; i++) 725 { 726 /* Cancel provisional discards not in middle of run of discards. */ 727 if (discards[i] == 2) 728 { 729 discards[i] = 0; 730 } 731 else if (discards[i] != 0) 732 { 733 /* We have found a nonprovisional discard. */ 734 int j; 735 int length; 736 int provisional = 0; 737 738 /* Find end of this run of discardable lines. 739 Count how many are provisionally discardable. */ 740 for (j = i; j < end; j++) 741 { 742 if (discards[j] == 0) 743 { 744 break; 745 } 746 if (discards[j] == 2) 747 { 748 ++provisional; 749 } 750 } 751 752 /* Cancel provisional discards at end, and shrink the run. */ 753 while (j > i && discards[ j - 1] == 2) 754 { 755 discards[ --j] = 0; --provisional; 756 } 757 758 /* Now we have the length of a run of discardable lines 759 whose first and last are not provisional. */ 760 length = j - i; 761 762 /* If 1/4 of the lines in the run are provisional, 763 cancel discarding of all provisional lines in the run. */ 764 if (provisional * 4 > length) 765 { 766 while (j > i) 767 { 768 if (discards[ --j] == 2) 769 { 770 discards[j] = 0; 771 } 772 } 773 } 774 else 775 { 776 int consec; 777 int minimum = 1; 778 int tem = length / 4; 779 780 /* MINIMUM is approximate square root of LENGTH/4. 781 A subrun of two or more provisionals can stand 782 when LENGTH is at least 16. 783 A subrun of 4 or more can stand when LENGTH >= 64. */ 784 while ((tem = tem >> 2) > 0) 785 { 786 minimum *= 2; 787 } 788 789 minimum++; 790 791 /* Cancel any subrun of MINIMUM or more provisionals 792 within the larger run. */ 793 for (j = 0, consec = 0; j < length; j++) 794 { 795 if (discards[ i + j] != 2) 796 { 797 consec = 0; 798 } 799 else{} 800 } 801 if (minimum == ++consec) /* Back up to start of subrun, to cancel it all. */ 802 { 803 j -= consec; 804 } 805 else if (minimum < consec) 806 { 807 discards[ i + j] = 0; 808 } 809 810 /* Scan from beginning of run 811 until we find 3 or more nonprovisionals in a row 812 or until the first nonprovisional at least 8 lines in. 813 Until that point, cancel any provisionals. */ 814 for (j = 0, consec = 0; j < length; j++) 815 { 816 if (j >= 8 && discards[ i + j] == 1) 817 { 818 break; 819 } 820 if (discards[ i + j] == 2) 821 { 822 consec = 0; discards[ i + j] = 0; 823 } 824 else if (discards[ i + j] == 0) 825 { 826 consec = 0; 827 } 828 else 829 { 830 consec++; 831 } 832 if (consec == 3) 833 { 834 break; 835 } 836 } 837 838 /* I advances to the last line of the run. */ 839 i += length - 1; 840 841 /* Same thing, from end. */ 842 for (j = 0, consec = 0; j < length; j++) 843 { 844 if (j >= 8 && discards[ i - j] == 1) 845 { 846 break; 847 } 848 if (discards[ i - j] == 2) 849 { 850 consec = 0; discards[ i - j] = 0; 851 } 852 else if (discards[ i - j] == 0) 853 { 854 consec = 0; 855 } 856 else 857 { 858 consec++; 859 } 860 if (consec == 3) 861 { 862 break; 863 } 864 } 865 } 866 } 867 } 868 } 869 870 /** Actually discard the lines. 871 * @param discards flags lines to be discarded 872 */ 873 private void discard(final byte[] discards) 874 { 875 final int end = buffered_lines; 876 int j = 0; 877 for (int i = 0; i < end; ++i) 878 { 879 if (no_discards || discards[i] == 0) 880 { 881 undiscarded[j] = equivs[i]; 882 realindexes[j++] = i; 883 } 884 else 885 { 886 changed_flag[1 + i] = true; 887 } 888 } 889 890 nondiscarded_lines = j; 891 } 892 893 file_data(Object[] data, Hashtable h) 894 { 895 buffered_lines = data.length; 896 897 equivs = new int[buffered_lines]; 898 undiscarded = new int[buffered_lines]; 899 realindexes = new int[buffered_lines]; 900 901 for (int i = 0; i < data.length; ++i) 902 { 903 Integer ir = (Integer) h.get(data[i]); 904 if (ir == null) 905 { 906 h.put(data[i], new Integer(equivs[i] = equiv_max++)); 907 } 908 else 909 { 910 equivs[i] = ir.intValue(); 911 } 912 } 913 } 914 915 /** Adjust inserts/deletes of blank lines to join changes 916 * as much as possible. 917 * 918 * We do something when a run of changed lines include a blank 919 * line at one end and have an excluded blank line at the other. 920 * We are free to choose which blank line is included. 921 * `compareseq' always chooses the one at the beginning, 922 * but usually it is cleaner to consider the following blank line 923 * to be the "change". The only exception is if the preceding blank line 924 * would join this change to other changes. 925 * @param f the file being compared against 926 */ 927 928 void shift_boundaries(file_data f) 929 { 930 final boolean[] changed = changed_flag; 931 final boolean[] other_changed = f.changed_flag; 932 int i = 0; 933 int j = 0; 934 int i_end = buffered_lines; 935 int preceding = -1; 936 int other_preceding = -1; 937 938 for (;;) 939 { 940 int start, end, other_start; 941 942 /* Scan forwards to find beginning of another run of changes. 943 Also keep track of the corresponding point in the other file. */ 944 945 while (i < i_end && !changed[1 + i]) 946 { 947 while (other_changed[1 + j++]) /* Non-corresponding lines in the other file 948 will count as the preceding batch of changes. */ 949 { 950 other_preceding = j; 951 } 952 953 i++; 954 } 955 956 if (i == i_end) 957 { 958 break; 959 } 960 961 start = i; 962 other_start = j; 963 964 for (;;) 965 { 966 /* Now find the end of this run of changes. */ 967 968 while (i < i_end && changed[1 + i]) 969 { 970 i++; 971 } 972 973 end = i; 974 975 /* If the first changed line matches the following unchanged one, 976 and this run does not follow right after a previous run, 977 and there are no lines deleted from the other file here, 978 then classify the first changed line as unchanged 979 and the following line as changed in its place. */ 980 981 /* You might ask, how could this run follow right after another? 982 Only because the previous run was shifted here. */ 983 984 if (end != i_end && equivs[start] == equivs[end] && !other_changed[1 + j] && end != i_end && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) 985 { 986 changed[1 + end++] = true; 987 changed[1 + start++] = false; 988 ++i; 989 /* Since one line-that-matches is now before this run 990 instead of after, we must advance in the other file 991 to keep in synch. */ 992 ++j; 993 } 994 else 995 { 996 break; 997 } 998 } 999 1000 preceding = i; 1001 other_preceding = j; 1002 } 1003 } 1004 1005 /** Number of elements (lines) in this file. */ 1006 final int buffered_lines; 1007 1008 /** Vector, indexed by line number, containing an equivalence code for 1009 * each line. It is this vector that is actually compared with that 1010 * of another file to generate differences. */ 1011 private final int[] equivs; 1012 1013 /** Vector, like the previous one except that 1014 * the elements for discarded lines have been squeezed out. */ 1015 final int[] undiscarded; 1016 1017 /** Vector mapping virtual line numbers (not counting discarded lines) 1018 * to real ones (counting those lines). Both are origin-0. */ 1019 final int[] realindexes; 1020 1021 /** Total number of nondiscarded lines. */ 1022 int nondiscarded_lines; 1023 1024 /** Array, indexed by real origin-1 line number, 1025 * containing true for a line that is an insertion or a deletion. 1026 * The results of comparison are stored here. */ 1027 boolean[] changed_flag; 1028 1029 } 1030} 1031