vx32

Local 9vx git repository for patches.
git clone git://r-36.net/vx32
Log | Files | Refs

blocksort.c (32594B)


      1 
      2 /*-------------------------------------------------------------*/
      3 /*--- Block sorting machinery                               ---*/
      4 /*---                                           blocksort.c ---*/
      5 /*-------------------------------------------------------------*/
      6 
      7 /*--
      8   This file is a part of bzip2 and/or libbzip2, a program and
      9   library for lossless, block-sorting data compression.
     10 
     11   Copyright (C) 1996-2005 Julian R Seward.  All rights reserved.
     12 
     13   Redistribution and use in source and binary forms, with or without
     14   modification, are permitted provided that the following conditions
     15   are met:
     16 
     17   1. Redistributions of source code must retain the above copyright
     18      notice, this list of conditions and the following disclaimer.
     19 
     20   2. The origin of this software must not be misrepresented; you must 
     21      not claim that you wrote the original software.  If you use this 
     22      software in a product, an acknowledgment in the product 
     23      documentation would be appreciated but is not required.
     24 
     25   3. Altered source versions must be plainly marked as such, and must
     26      not be misrepresented as being the original software.
     27 
     28   4. The name of the author may not be used to endorse or promote 
     29      products derived from this software without specific prior written 
     30      permission.
     31 
     32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
     38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     43 
     44   Julian Seward, Cambridge, UK.
     45   jseward@bzip.org
     46   bzip2/libbzip2 version 1.0 of 21 March 2000
     47 
     48   This program is based on (at least) the work of:
     49      Mike Burrows
     50      David Wheeler
     51      Peter Fenwick
     52      Alistair Moffat
     53      Radford Neal
     54      Ian H. Witten
     55      Robert Sedgewick
     56      Jon L. Bentley
     57 
     58   For more information on these sources, see the manual.
     59 
     60   To get some idea how the block sorting algorithms in this file 
     61   work, read my paper 
     62      On the Performance of BWT Sorting Algorithms
     63   in Proceedings of the IEEE Data Compression Conference 2000,
     64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
     65   file implements the algorithm called  cache  in the paper.
     66 --*/
     67 
     68 
     69 #include "bzlib_private.h"
     70 
     71 /*---------------------------------------------*/
     72 /*--- Fallback O(N log(N)^2) sorting        ---*/
     73 /*--- algorithm, for repetitive blocks      ---*/
     74 /*---------------------------------------------*/
     75 
     76 /*---------------------------------------------*/
     77 static 
     78 __inline__
     79 void fallbackSimpleSort ( UInt32* fmap, 
     80                           UInt32* eclass, 
     81                           Int32   lo, 
     82                           Int32   hi )
     83 {
     84    Int32 i, j, tmp;
     85    UInt32 ec_tmp;
     86 
     87    if (lo == hi) return;
     88 
     89    if (hi - lo > 3) {
     90       for ( i = hi-4; i >= lo; i-- ) {
     91          tmp = fmap[i];
     92          ec_tmp = eclass[tmp];
     93          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
     94             fmap[j-4] = fmap[j];
     95          fmap[j-4] = tmp;
     96       }
     97    }
     98 
     99    for ( i = hi-1; i >= lo; i-- ) {
    100       tmp = fmap[i];
    101       ec_tmp = eclass[tmp];
    102       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
    103          fmap[j-1] = fmap[j];
    104       fmap[j-1] = tmp;
    105    }
    106 }
    107 
    108 
    109 /*---------------------------------------------*/
    110 #define fswap(zz1, zz2) \
    111    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
    112 
    113 #define fvswap(zzp1, zzp2, zzn)       \
    114 {                                     \
    115    Int32 yyp1 = (zzp1);               \
    116    Int32 yyp2 = (zzp2);               \
    117    Int32 yyn  = (zzn);                \
    118    while (yyn > 0) {                  \
    119       fswap(fmap[yyp1], fmap[yyp2]);  \
    120       yyp1++; yyp2++; yyn--;          \
    121    }                                  \
    122 }
    123 
    124 
    125 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
    126 
    127 #define fpush(lz,hz) { stackLo[sp] = lz; \
    128                        stackHi[sp] = hz; \
    129                        sp++; }
    130 
    131 #define fpop(lz,hz) { sp--;              \
    132                       lz = stackLo[sp];  \
    133                       hz = stackHi[sp]; }
    134 
    135 #define FALLBACK_QSORT_SMALL_THRESH 10
    136 #define FALLBACK_QSORT_STACK_SIZE   100
    137 
    138 
    139 static
    140 void fallbackQSort3 ( UInt32* fmap, 
    141                       UInt32* eclass,
    142                       Int32   loSt, 
    143                       Int32   hiSt )
    144 {
    145    Int32 unLo, unHi, ltLo, gtHi, n, m;
    146    Int32 sp, lo, hi;
    147    UInt32 med, r, r3;
    148    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
    149    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
    150 
    151    r = 0;
    152 
    153    sp = 0;
    154    fpush ( loSt, hiSt );
    155 
    156    while (sp > 0) {
    157 
    158       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
    159 
    160       fpop ( lo, hi );
    161       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
    162          fallbackSimpleSort ( fmap, eclass, lo, hi );
    163          continue;
    164       }
    165 
    166       /* Random partitioning.  Median of 3 sometimes fails to
    167          avoid bad cases.  Median of 9 seems to help but 
    168          looks rather expensive.  This too seems to work but
    169          is cheaper.  Guidance for the magic constants 
    170          7621 and 32768 is taken from Sedgewick's algorithms
    171          book, chapter 35.
    172       */
    173       r = ((r * 7621) + 1) % 32768;
    174       r3 = r % 3;
    175       if (r3 == 0) med = eclass[fmap[lo]]; else
    176       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
    177                    med = eclass[fmap[hi]];
    178 
    179       unLo = ltLo = lo;
    180       unHi = gtHi = hi;
    181 
    182       while (1) {
    183          while (1) {
    184             if (unLo > unHi) break;
    185             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
    186             if (n == 0) { 
    187                fswap(fmap[unLo], fmap[ltLo]); 
    188                ltLo++; unLo++; 
    189                continue; 
    190             };
    191             if (n > 0) break;
    192             unLo++;
    193          }
    194          while (1) {
    195             if (unLo > unHi) break;
    196             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
    197             if (n == 0) { 
    198                fswap(fmap[unHi], fmap[gtHi]); 
    199                gtHi--; unHi--; 
    200                continue; 
    201             };
    202             if (n < 0) break;
    203             unHi--;
    204          }
    205          if (unLo > unHi) break;
    206          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
    207       }
    208 
    209       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
    210 
    211       if (gtHi < ltLo) continue;
    212 
    213       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
    214       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
    215 
    216       n = lo + unLo - ltLo - 1;
    217       m = hi - (gtHi - unHi) + 1;
    218 
    219       if (n - lo > hi - m) {
    220          fpush ( lo, n );
    221          fpush ( m, hi );
    222       } else {
    223          fpush ( m, hi );
    224          fpush ( lo, n );
    225       }
    226    }
    227 }
    228 
    229 #undef fmin
    230 #undef fpush
    231 #undef fpop
    232 #undef fswap
    233 #undef fvswap
    234 #undef FALLBACK_QSORT_SMALL_THRESH
    235 #undef FALLBACK_QSORT_STACK_SIZE
    236 
    237 
    238 /*---------------------------------------------*/
    239 /* Pre:
    240       nblock > 0
    241       eclass exists for [0 .. nblock-1]
    242       ((UChar*)eclass) [0 .. nblock-1] holds block
    243       ptr exists for [0 .. nblock-1]
    244 
    245    Post:
    246       ((UChar*)eclass) [0 .. nblock-1] holds block
    247       All other areas of eclass destroyed
    248       fmap [0 .. nblock-1] holds sorted order
    249       bhtab [ 0 .. 2+(nblock/32) ] destroyed
    250 */
    251 
    252 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
    253 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
    254 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
    255 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
    256 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
    257 
    258 static
    259 void fallbackSort ( UInt32* fmap, 
    260                     UInt32* eclass, 
    261                     UInt32* bhtab,
    262                     Int32   nblock,
    263                     Int32   verb )
    264 {
    265    Int32 ftab[257];
    266    Int32 ftabCopy[256];
    267    Int32 H, i, j, k, l, r, cc, cc1;
    268    Int32 nNotDone;
    269    Int32 nBhtab;
    270    UChar* eclass8 = (UChar*)eclass;
    271 
    272    /*--
    273       Initial 1-char radix sort to generate
    274       initial fmap and initial BH bits.
    275    --*/
    276    if (verb >= 4)
    277       VPrintf0 ( "        bucket sorting ...\n" );
    278    for (i = 0; i < 257;    i++) ftab[i] = 0;
    279    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
    280    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
    281    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
    282 
    283    for (i = 0; i < nblock; i++) {
    284       j = eclass8[i];
    285       k = ftab[j] - 1;
    286       ftab[j] = k;
    287       fmap[k] = i;
    288    }
    289 
    290    nBhtab = 2 + (nblock / 32);
    291    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
    292    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
    293 
    294    /*--
    295       Inductively refine the buckets.  Kind-of an
    296       "exponential radix sort" (!), inspired by the
    297       Manber-Myers suffix array construction algorithm.
    298    --*/
    299 
    300    /*-- set sentinel bits for block-end detection --*/
    301    for (i = 0; i < 32; i++) { 
    302       SET_BH(nblock + 2*i);
    303       CLEAR_BH(nblock + 2*i + 1);
    304    }
    305 
    306    /*-- the log(N) loop --*/
    307    H = 1;
    308    while (1) {
    309 
    310       if (verb >= 4) 
    311          VPrintf1 ( "        depth %6d has ", H );
    312 
    313       j = 0;
    314       for (i = 0; i < nblock; i++) {
    315          if (ISSET_BH(i)) j = i;
    316          k = fmap[i] - H; if (k < 0) k += nblock;
    317          eclass[k] = j;
    318       }
    319 
    320       nNotDone = 0;
    321       r = -1;
    322       while (1) {
    323 
    324 	 /*-- find the next non-singleton bucket --*/
    325          k = r + 1;
    326          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    327          if (ISSET_BH(k)) {
    328             while (WORD_BH(k) == 0xffffffff) k += 32;
    329             while (ISSET_BH(k)) k++;
    330          }
    331          l = k - 1;
    332          if (l >= nblock) break;
    333          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    334          if (!ISSET_BH(k)) {
    335             while (WORD_BH(k) == 0x00000000) k += 32;
    336             while (!ISSET_BH(k)) k++;
    337          }
    338          r = k - 1;
    339          if (r >= nblock) break;
    340 
    341          /*-- now [l, r] bracket current bucket --*/
    342          if (r > l) {
    343             nNotDone += (r - l + 1);
    344             fallbackQSort3 ( fmap, eclass, l, r );
    345 
    346             /*-- scan bucket and generate header bits-- */
    347             cc = -1;
    348             for (i = l; i <= r; i++) {
    349                cc1 = eclass[fmap[i]];
    350                if (cc != cc1) { SET_BH(i); cc = cc1; };
    351             }
    352          }
    353       }
    354 
    355       if (verb >= 4) 
    356          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
    357 
    358       H *= 2;
    359       if (H > nblock || nNotDone == 0) break;
    360    }
    361 
    362    /*-- 
    363       Reconstruct the original block in
    364       eclass8 [0 .. nblock-1], since the
    365       previous phase destroyed it.
    366    --*/
    367    if (verb >= 4)
    368       VPrintf0 ( "        reconstructing block ...\n" );
    369    j = 0;
    370    for (i = 0; i < nblock; i++) {
    371       while (ftabCopy[j] == 0) j++;
    372       ftabCopy[j]--;
    373       eclass8[fmap[i]] = (UChar)j;
    374    }
    375    AssertH ( j < 256, 1005 );
    376 }
    377 
    378 #undef       SET_BH
    379 #undef     CLEAR_BH
    380 #undef     ISSET_BH
    381 #undef      WORD_BH
    382 #undef UNALIGNED_BH
    383 
    384 
    385 /*---------------------------------------------*/
    386 /*--- The main, O(N^2 log(N)) sorting       ---*/
    387 /*--- algorithm.  Faster for "normal"       ---*/
    388 /*--- non-repetitive blocks.                ---*/
    389 /*---------------------------------------------*/
    390 
    391 /*---------------------------------------------*/
    392 static
    393 __inline__
    394 Bool mainGtU ( UInt32  i1, 
    395                UInt32  i2,
    396                UChar*  block, 
    397                UInt16* quadrant,
    398                UInt32  nblock,
    399                Int32*  budget )
    400 {
    401    Int32  k;
    402    UChar  c1, c2;
    403    UInt16 s1, s2;
    404 
    405    AssertD ( i1 != i2, "mainGtU" );
    406    /* 1 */
    407    c1 = block[i1]; c2 = block[i2];
    408    if (c1 != c2) return (c1 > c2);
    409    i1++; i2++;
    410    /* 2 */
    411    c1 = block[i1]; c2 = block[i2];
    412    if (c1 != c2) return (c1 > c2);
    413    i1++; i2++;
    414    /* 3 */
    415    c1 = block[i1]; c2 = block[i2];
    416    if (c1 != c2) return (c1 > c2);
    417    i1++; i2++;
    418    /* 4 */
    419    c1 = block[i1]; c2 = block[i2];
    420    if (c1 != c2) return (c1 > c2);
    421    i1++; i2++;
    422    /* 5 */
    423    c1 = block[i1]; c2 = block[i2];
    424    if (c1 != c2) return (c1 > c2);
    425    i1++; i2++;
    426    /* 6 */
    427    c1 = block[i1]; c2 = block[i2];
    428    if (c1 != c2) return (c1 > c2);
    429    i1++; i2++;
    430    /* 7 */
    431    c1 = block[i1]; c2 = block[i2];
    432    if (c1 != c2) return (c1 > c2);
    433    i1++; i2++;
    434    /* 8 */
    435    c1 = block[i1]; c2 = block[i2];
    436    if (c1 != c2) return (c1 > c2);
    437    i1++; i2++;
    438    /* 9 */
    439    c1 = block[i1]; c2 = block[i2];
    440    if (c1 != c2) return (c1 > c2);
    441    i1++; i2++;
    442    /* 10 */
    443    c1 = block[i1]; c2 = block[i2];
    444    if (c1 != c2) return (c1 > c2);
    445    i1++; i2++;
    446    /* 11 */
    447    c1 = block[i1]; c2 = block[i2];
    448    if (c1 != c2) return (c1 > c2);
    449    i1++; i2++;
    450    /* 12 */
    451    c1 = block[i1]; c2 = block[i2];
    452    if (c1 != c2) return (c1 > c2);
    453    i1++; i2++;
    454 
    455    k = nblock + 8;
    456 
    457    do {
    458       /* 1 */
    459       c1 = block[i1]; c2 = block[i2];
    460       if (c1 != c2) return (c1 > c2);
    461       s1 = quadrant[i1]; s2 = quadrant[i2];
    462       if (s1 != s2) return (s1 > s2);
    463       i1++; i2++;
    464       /* 2 */
    465       c1 = block[i1]; c2 = block[i2];
    466       if (c1 != c2) return (c1 > c2);
    467       s1 = quadrant[i1]; s2 = quadrant[i2];
    468       if (s1 != s2) return (s1 > s2);
    469       i1++; i2++;
    470       /* 3 */
    471       c1 = block[i1]; c2 = block[i2];
    472       if (c1 != c2) return (c1 > c2);
    473       s1 = quadrant[i1]; s2 = quadrant[i2];
    474       if (s1 != s2) return (s1 > s2);
    475       i1++; i2++;
    476       /* 4 */
    477       c1 = block[i1]; c2 = block[i2];
    478       if (c1 != c2) return (c1 > c2);
    479       s1 = quadrant[i1]; s2 = quadrant[i2];
    480       if (s1 != s2) return (s1 > s2);
    481       i1++; i2++;
    482       /* 5 */
    483       c1 = block[i1]; c2 = block[i2];
    484       if (c1 != c2) return (c1 > c2);
    485       s1 = quadrant[i1]; s2 = quadrant[i2];
    486       if (s1 != s2) return (s1 > s2);
    487       i1++; i2++;
    488       /* 6 */
    489       c1 = block[i1]; c2 = block[i2];
    490       if (c1 != c2) return (c1 > c2);
    491       s1 = quadrant[i1]; s2 = quadrant[i2];
    492       if (s1 != s2) return (s1 > s2);
    493       i1++; i2++;
    494       /* 7 */
    495       c1 = block[i1]; c2 = block[i2];
    496       if (c1 != c2) return (c1 > c2);
    497       s1 = quadrant[i1]; s2 = quadrant[i2];
    498       if (s1 != s2) return (s1 > s2);
    499       i1++; i2++;
    500       /* 8 */
    501       c1 = block[i1]; c2 = block[i2];
    502       if (c1 != c2) return (c1 > c2);
    503       s1 = quadrant[i1]; s2 = quadrant[i2];
    504       if (s1 != s2) return (s1 > s2);
    505       i1++; i2++;
    506 
    507       if (i1 >= nblock) i1 -= nblock;
    508       if (i2 >= nblock) i2 -= nblock;
    509 
    510       k -= 8;
    511       (*budget)--;
    512    }
    513       while (k >= 0);
    514 
    515    return False;
    516 }
    517 
    518 
    519 /*---------------------------------------------*/
    520 /*--
    521    Knuth's increments seem to work better
    522    than Incerpi-Sedgewick here.  Possibly
    523    because the number of elems to sort is
    524    usually small, typically <= 20.
    525 --*/
    526 static
    527 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
    528                    9841, 29524, 88573, 265720,
    529                    797161, 2391484 };
    530 
    531 static
    532 void mainSimpleSort ( UInt32* ptr,
    533                       UChar*  block,
    534                       UInt16* quadrant,
    535                       Int32   nblock,
    536                       Int32   lo, 
    537                       Int32   hi, 
    538                       Int32   d,
    539                       Int32*  budget )
    540 {
    541    Int32 i, j, h, bigN, hp;
    542    UInt32 v;
    543 
    544    bigN = hi - lo + 1;
    545    if (bigN < 2) return;
    546 
    547    hp = 0;
    548    while (incs[hp] < bigN) hp++;
    549    hp--;
    550 
    551    for (; hp >= 0; hp--) {
    552       h = incs[hp];
    553 
    554       i = lo + h;
    555       while (True) {
    556 
    557          /*-- copy 1 --*/
    558          if (i > hi) break;
    559          v = ptr[i];
    560          j = i;
    561          while ( mainGtU ( 
    562                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
    563                  ) ) {
    564             ptr[j] = ptr[j-h];
    565             j = j - h;
    566             if (j <= (lo + h - 1)) break;
    567          }
    568          ptr[j] = v;
    569          i++;
    570 
    571          /*-- copy 2 --*/
    572          if (i > hi) break;
    573          v = ptr[i];
    574          j = i;
    575          while ( mainGtU ( 
    576                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
    577                  ) ) {
    578             ptr[j] = ptr[j-h];
    579             j = j - h;
    580             if (j <= (lo + h - 1)) break;
    581          }
    582          ptr[j] = v;
    583          i++;
    584 
    585          /*-- copy 3 --*/
    586          if (i > hi) break;
    587          v = ptr[i];
    588          j = i;
    589          while ( mainGtU ( 
    590                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
    591                  ) ) {
    592             ptr[j] = ptr[j-h];
    593             j = j - h;
    594             if (j <= (lo + h - 1)) break;
    595          }
    596          ptr[j] = v;
    597          i++;
    598 
    599          if (*budget < 0) return;
    600       }
    601    }
    602 }
    603 
    604 
    605 /*---------------------------------------------*/
    606 /*--
    607    The following is an implementation of
    608    an elegant 3-way quicksort for strings,
    609    described in a paper "Fast Algorithms for
    610    Sorting and Searching Strings", by Robert
    611    Sedgewick and Jon L. Bentley.
    612 --*/
    613 
    614 #define mswap(zz1, zz2) \
    615    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
    616 
    617 #define mvswap(zzp1, zzp2, zzn)       \
    618 {                                     \
    619    Int32 yyp1 = (zzp1);               \
    620    Int32 yyp2 = (zzp2);               \
    621    Int32 yyn  = (zzn);                \
    622    while (yyn > 0) {                  \
    623       mswap(ptr[yyp1], ptr[yyp2]);    \
    624       yyp1++; yyp2++; yyn--;          \
    625    }                                  \
    626 }
    627 
    628 static 
    629 __inline__
    630 UChar mmed3 ( UChar a, UChar b, UChar c )
    631 {
    632    UChar t;
    633    if (a > b) { t = a; a = b; b = t; };
    634    if (b > c) { 
    635       b = c;
    636       if (a > b) b = a;
    637    }
    638    return b;
    639 }
    640 
    641 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
    642 
    643 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
    644                           stackHi[sp] = hz; \
    645                           stackD [sp] = dz; \
    646                           sp++; }
    647 
    648 #define mpop(lz,hz,dz) { sp--;             \
    649                          lz = stackLo[sp]; \
    650                          hz = stackHi[sp]; \
    651                          dz = stackD [sp]; }
    652 
    653 
    654 #define mnextsize(az) (nextHi[az]-nextLo[az])
    655 
    656 #define mnextswap(az,bz)                                        \
    657    { Int32 tz;                                                  \
    658      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
    659      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
    660      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
    661 
    662 
    663 #define MAIN_QSORT_SMALL_THRESH 20
    664 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
    665 #define MAIN_QSORT_STACK_SIZE 100
    666 
    667 static
    668 void mainQSort3 ( UInt32* ptr,
    669                   UChar*  block,
    670                   UInt16* quadrant,
    671                   Int32   nblock,
    672                   Int32   loSt, 
    673                   Int32   hiSt, 
    674                   Int32   dSt,
    675                   Int32*  budget )
    676 {
    677    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
    678    Int32 sp, lo, hi, d;
    679 
    680    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
    681    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
    682    Int32 stackD [MAIN_QSORT_STACK_SIZE];
    683 
    684    Int32 nextLo[3];
    685    Int32 nextHi[3];
    686    Int32 nextD [3];
    687 
    688    sp = 0;
    689    mpush ( loSt, hiSt, dSt );
    690 
    691    while (sp > 0) {
    692 
    693       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
    694 
    695       mpop ( lo, hi, d );
    696       if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
    697           d > MAIN_QSORT_DEPTH_THRESH) {
    698          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
    699          if (*budget < 0) return;
    700          continue;
    701       }
    702 
    703       med = (Int32) 
    704             mmed3 ( block[ptr[ lo         ]+d],
    705                     block[ptr[ hi         ]+d],
    706                     block[ptr[ (lo+hi)>>1 ]+d] );
    707 
    708       unLo = ltLo = lo;
    709       unHi = gtHi = hi;
    710 
    711       while (True) {
    712          while (True) {
    713             if (unLo > unHi) break;
    714             n = ((Int32)block[ptr[unLo]+d]) - med;
    715             if (n == 0) { 
    716                mswap(ptr[unLo], ptr[ltLo]); 
    717                ltLo++; unLo++; continue; 
    718             };
    719             if (n >  0) break;
    720             unLo++;
    721          }
    722          while (True) {
    723             if (unLo > unHi) break;
    724             n = ((Int32)block[ptr[unHi]+d]) - med;
    725             if (n == 0) { 
    726                mswap(ptr[unHi], ptr[gtHi]); 
    727                gtHi--; unHi--; continue; 
    728             };
    729             if (n <  0) break;
    730             unHi--;
    731          }
    732          if (unLo > unHi) break;
    733          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
    734       }
    735 
    736       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
    737 
    738       if (gtHi < ltLo) {
    739          mpush(lo, hi, d+1 );
    740          continue;
    741       }
    742 
    743       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
    744       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
    745 
    746       n = lo + unLo - ltLo - 1;
    747       m = hi - (gtHi - unHi) + 1;
    748 
    749       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
    750       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
    751       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
    752 
    753       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    754       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
    755       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    756 
    757       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
    758       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
    759 
    760       mpush (nextLo[0], nextHi[0], nextD[0]);
    761       mpush (nextLo[1], nextHi[1], nextD[1]);
    762       mpush (nextLo[2], nextHi[2], nextD[2]);
    763    }
    764 }
    765 
    766 #undef mswap
    767 #undef mvswap
    768 #undef mpush
    769 #undef mpop
    770 #undef mmin
    771 #undef mnextsize
    772 #undef mnextswap
    773 #undef MAIN_QSORT_SMALL_THRESH
    774 #undef MAIN_QSORT_DEPTH_THRESH
    775 #undef MAIN_QSORT_STACK_SIZE
    776 
    777 
    778 /*---------------------------------------------*/
    779 /* Pre:
    780       nblock > N_OVERSHOOT
    781       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
    782       ((UChar*)block32) [0 .. nblock-1] holds block
    783       ptr exists for [0 .. nblock-1]
    784 
    785    Post:
    786       ((UChar*)block32) [0 .. nblock-1] holds block
    787       All other areas of block32 destroyed
    788       ftab [0 .. 65536 ] destroyed
    789       ptr [0 .. nblock-1] holds sorted order
    790       if (*budget < 0), sorting was abandoned
    791 */
    792 
    793 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
    794 #define SETMASK (1 << 21)
    795 #define CLEARMASK (~(SETMASK))
    796 
    797 static
    798 void mainSort ( UInt32* ptr, 
    799                 UChar*  block,
    800                 UInt16* quadrant, 
    801                 UInt32* ftab,
    802                 Int32   nblock,
    803                 Int32   verb,
    804                 Int32*  budget )
    805 {
    806    Int32  i, j, k, ss, sb;
    807    Int32  runningOrder[256];
    808    Bool   bigDone[256];
    809    Int32  copyStart[256];
    810    Int32  copyEnd  [256];
    811    UChar  c1;
    812    Int32  numQSorted;
    813    UInt16 s;
    814    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
    815 
    816    /*-- set up the 2-byte frequency table --*/
    817    for (i = 65536; i >= 0; i--) ftab[i] = 0;
    818 
    819    j = block[0] << 8;
    820    i = nblock-1;
    821    for (; i >= 3; i -= 4) {
    822       quadrant[i] = 0;
    823       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    824       ftab[j]++;
    825       quadrant[i-1] = 0;
    826       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
    827       ftab[j]++;
    828       quadrant[i-2] = 0;
    829       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
    830       ftab[j]++;
    831       quadrant[i-3] = 0;
    832       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
    833       ftab[j]++;
    834    }
    835    for (; i >= 0; i--) {
    836       quadrant[i] = 0;
    837       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    838       ftab[j]++;
    839    }
    840 
    841    /*-- (emphasises close relationship of block & quadrant) --*/
    842    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
    843       block   [nblock+i] = block[i];
    844       quadrant[nblock+i] = 0;
    845    }
    846 
    847    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
    848 
    849    /*-- Complete the initial radix sort --*/
    850    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
    851 
    852    s = block[0] << 8;
    853    i = nblock-1;
    854    for (; i >= 3; i -= 4) {
    855       s = (s >> 8) | (block[i] << 8);
    856       j = ftab[s] -1;
    857       ftab[s] = j;
    858       ptr[j] = i;
    859       s = (s >> 8) | (block[i-1] << 8);
    860       j = ftab[s] -1;
    861       ftab[s] = j;
    862       ptr[j] = i-1;
    863       s = (s >> 8) | (block[i-2] << 8);
    864       j = ftab[s] -1;
    865       ftab[s] = j;
    866       ptr[j] = i-2;
    867       s = (s >> 8) | (block[i-3] << 8);
    868       j = ftab[s] -1;
    869       ftab[s] = j;
    870       ptr[j] = i-3;
    871    }
    872    for (; i >= 0; i--) {
    873       s = (s >> 8) | (block[i] << 8);
    874       j = ftab[s] -1;
    875       ftab[s] = j;
    876       ptr[j] = i;
    877    }
    878 
    879    /*--
    880       Now ftab contains the first loc of every small bucket.
    881       Calculate the running order, from smallest to largest
    882       big bucket.
    883    --*/
    884    for (i = 0; i <= 255; i++) {
    885       bigDone     [i] = False;
    886       runningOrder[i] = i;
    887    }
    888 
    889    {
    890       Int32 vv;
    891       Int32 h = 1;
    892       do h = 3 * h + 1; while (h <= 256);
    893       do {
    894          h = h / 3;
    895          for (i = h; i <= 255; i++) {
    896             vv = runningOrder[i];
    897             j = i;
    898             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
    899                runningOrder[j] = runningOrder[j-h];
    900                j = j - h;
    901                if (j <= (h - 1)) goto zero;
    902             }
    903             zero:
    904             runningOrder[j] = vv;
    905          }
    906       } while (h != 1);
    907    }
    908 
    909    /*--
    910       The main sorting loop.
    911    --*/
    912 
    913    numQSorted = 0;
    914 
    915    for (i = 0; i <= 255; i++) {
    916 
    917       /*--
    918          Process big buckets, starting with the least full.
    919          Basically this is a 3-step process in which we call
    920          mainQSort3 to sort the small buckets [ss, j], but
    921          also make a big effort to avoid the calls if we can.
    922       --*/
    923       ss = runningOrder[i];
    924 
    925       /*--
    926          Step 1:
    927          Complete the big bucket [ss] by quicksorting
    928          any unsorted small buckets [ss, j], for j != ss.  
    929          Hopefully previous pointer-scanning phases have already
    930          completed many of the small buckets [ss, j], so
    931          we don't have to sort them at all.
    932       --*/
    933       for (j = 0; j <= 255; j++) {
    934          if (j != ss) {
    935             sb = (ss << 8) + j;
    936             if ( ! (ftab[sb] & SETMASK) ) {
    937                Int32 lo = ftab[sb]   & CLEARMASK;
    938                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
    939                if (hi > lo) {
    940                   if (verb >= 4)
    941                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
    942                                 "done %d   this %d\n",
    943                                 ss, j, numQSorted, hi - lo + 1 );
    944                   mainQSort3 ( 
    945                      ptr, block, quadrant, nblock, 
    946                      lo, hi, BZ_N_RADIX, budget 
    947                   );   
    948                   numQSorted += (hi - lo + 1);
    949                   if (*budget < 0) return;
    950                }
    951             }
    952             ftab[sb] |= SETMASK;
    953          }
    954       }
    955 
    956       AssertH ( !bigDone[ss], 1006 );
    957 
    958       /*--
    959          Step 2:
    960          Now scan this big bucket [ss] so as to synthesise the
    961          sorted order for small buckets [t, ss] for all t,
    962          including, magically, the bucket [ss,ss] too.
    963          This will avoid doing Real Work in subsequent Step 1's.
    964       --*/
    965       {
    966          for (j = 0; j <= 255; j++) {
    967             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
    968             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
    969          }
    970          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
    971             k = ptr[j]-1; if (k < 0) k += nblock;
    972             c1 = block[k];
    973             if (!bigDone[c1])
    974                ptr[ copyStart[c1]++ ] = k;
    975          }
    976          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
    977             k = ptr[j]-1; if (k < 0) k += nblock;
    978             c1 = block[k];
    979             if (!bigDone[c1]) 
    980                ptr[ copyEnd[c1]-- ] = k;
    981          }
    982       }
    983 
    984       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
    985                 || 
    986                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
    987                    Necessity for this case is demonstrated by compressing 
    988                    a sequence of approximately 48.5 million of character 
    989                    251; 1.0.0/1.0.1 will then die here. */
    990                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
    991                 1007 )
    992 
    993       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
    994 
    995       /*--
    996          Step 3:
    997          The [ss] big bucket is now done.  Record this fact,
    998          and update the quadrant descriptors.  Remember to
    999          update quadrants in the overshoot area too, if
   1000          necessary.  The "if (i < 255)" test merely skips
   1001          this updating for the last bucket processed, since
   1002          updating for the last bucket is pointless.
   1003 
   1004          The quadrant array provides a way to incrementally
   1005          cache sort orderings, as they appear, so as to 
   1006          make subsequent comparisons in fullGtU() complete
   1007          faster.  For repetitive blocks this makes a big
   1008          difference (but not big enough to be able to avoid
   1009          the fallback sorting mechanism, exponential radix sort).
   1010 
   1011          The precise meaning is: at all times:
   1012 
   1013             for 0 <= i < nblock and 0 <= j <= nblock
   1014 
   1015             if block[i] != block[j], 
   1016 
   1017                then the relative values of quadrant[i] and 
   1018                     quadrant[j] are meaningless.
   1019 
   1020                else {
   1021                   if quadrant[i] < quadrant[j]
   1022                      then the string starting at i lexicographically
   1023                      precedes the string starting at j
   1024 
   1025                   else if quadrant[i] > quadrant[j]
   1026                      then the string starting at j lexicographically
   1027                      precedes the string starting at i
   1028 
   1029                   else
   1030                      the relative ordering of the strings starting
   1031                      at i and j has not yet been determined.
   1032                }
   1033       --*/
   1034       bigDone[ss] = True;
   1035 
   1036       if (i < 255) {
   1037          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
   1038          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
   1039          Int32 shifts   = 0;
   1040 
   1041          while ((bbSize >> shifts) > 65534) shifts++;
   1042 
   1043          for (j = bbSize-1; j >= 0; j--) {
   1044             Int32 a2update     = ptr[bbStart + j];
   1045             UInt16 qVal        = (UInt16)(j >> shifts);
   1046             quadrant[a2update] = qVal;
   1047             if (a2update < BZ_N_OVERSHOOT)
   1048                quadrant[a2update + nblock] = qVal;
   1049          }
   1050          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
   1051       }
   1052 
   1053    }
   1054 
   1055    if (verb >= 4)
   1056       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
   1057                  nblock, numQSorted, nblock - numQSorted );
   1058 }
   1059 
   1060 #undef BIGFREQ
   1061 #undef SETMASK
   1062 #undef CLEARMASK
   1063 
   1064 
   1065 /*---------------------------------------------*/
   1066 /* Pre:
   1067       nblock > 0
   1068       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
   1069       ((UChar*)arr2)  [0 .. nblock-1] holds block
   1070       arr1 exists for [0 .. nblock-1]
   1071 
   1072    Post:
   1073       ((UChar*)arr2) [0 .. nblock-1] holds block
   1074       All other areas of block destroyed
   1075       ftab [ 0 .. 65536 ] destroyed
   1076       arr1 [0 .. nblock-1] holds sorted order
   1077 */
   1078 void BZ2_blockSort ( EState* s )
   1079 {
   1080    UInt32* ptr    = s->ptr; 
   1081    UChar*  block  = s->block;
   1082    UInt32* ftab   = s->ftab;
   1083    Int32   nblock = s->nblock;
   1084    Int32   verb   = s->verbosity;
   1085    Int32   wfact  = s->workFactor;
   1086    UInt16* quadrant;
   1087    Int32   budget;
   1088    Int32   budgetInit;
   1089    Int32   i;
   1090 
   1091    if (nblock < 10000) {
   1092       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1093    } else {
   1094       /* Calculate the location for quadrant, remembering to get
   1095          the alignment right.  Assumes that &(block[0]) is at least
   1096          2-byte aligned -- this should be ok since block is really
   1097          the first section of arr2.
   1098       */
   1099       i = nblock+BZ_N_OVERSHOOT;
   1100       if (i & 1) i++;
   1101       quadrant = (UInt16*)(&(block[i]));
   1102 
   1103       /* (wfact-1) / 3 puts the default-factor-30
   1104          transition point at very roughly the same place as 
   1105          with v0.1 and v0.9.0.  
   1106          Not that it particularly matters any more, since the
   1107          resulting compressed stream is now the same regardless
   1108          of whether or not we use the main sort or fallback sort.
   1109       */
   1110       if (wfact < 1  ) wfact = 1;
   1111       if (wfact > 100) wfact = 100;
   1112       budgetInit = nblock * ((wfact-1) / 3);
   1113       budget = budgetInit;
   1114 
   1115       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
   1116       if (verb >= 3) 
   1117          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
   1118                     budgetInit - budget,
   1119                     nblock, 
   1120                     (float)(budgetInit - budget) /
   1121                     (float)(nblock==0 ? 1 : nblock) ); 
   1122       if (budget < 0) {
   1123          if (verb >= 2) 
   1124             VPrintf0 ( "    too repetitive; using fallback"
   1125                        " sorting algorithm\n" );
   1126          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1127       }
   1128    }
   1129 
   1130    s->origPtr = -1;
   1131    for (i = 0; i < s->nblock; i++)
   1132       if (ptr[i] == 0)
   1133          { s->origPtr = i; break; };
   1134 
   1135    AssertH( s->origPtr != -1, 1003 );
   1136 }
   1137 
   1138 
   1139 /*-------------------------------------------------------------*/
   1140 /*--- end                                       blocksort.c ---*/
   1141 /*-------------------------------------------------------------*/