CMake/Utilities/cmcompress/cmcompress.c

/*
 * Copyright (c) 1985, 1986 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * James A. Woods, derived from original work by Spencer Thomas
 * and Joseph Orost.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *  This product includes software developed by the University of
 *  California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "cmcompress.h"

#include <errno.h>
#include <string.h>

static const char_type magic_header[] = { "\037\235" };  /* 1F 9D */

/* Defines for third byte of header */
#define BIT_MASK  0x1f
#define BLOCK_MASK  0x80
#define CHECK_GAP 10000  /* ratio check interval */
/* Masks 0x40 and 0x20 are free.  I think 0x20 should mean that there is
   a fourth header byte (for expansion).
   */
#define INIT_BITS 9      /* initial number of bits/code */

#ifdef COMPATIBLE    /* But wrong! */
# define MAXCODE(n_bits)  (1 << (n_bits) - 1)
#else
# define MAXCODE(n_bits)  ((1 << (n_bits)) - 1)
#endif /* COMPATIBLE */

#define htabof(i)  cdata->htab[i]
#define codetabof(i)  cdata->codetab[i]

/*
 * the next two codes should not be changed lightly, as they must not
 * lie within the contiguous general code space.
 */
#define FIRST  257  /* first free entry */
#define  CLEAR  256  /* table clear output code */

#ifdef DEBUG
static void prratio( FILE *stream, long int num, long int den);
#endif

int cmcompress_compress_initialize(struct cmcompress_stream* cdata)
{
  cdata->maxbits = BITS;      /* user settable max # bits/code */
  cdata->maxmaxcode = 1 << BITS;  /* should NEVER generate this code */
  cdata->hsize = HSIZE;      /* for dynamic table sizing */
  cdata->free_ent = 0;      /* first unused entry */
  cdata->nomagic = 0;  /* Use a 3-byte magic number header, unless old file */
  cdata->block_compress = BLOCK_MASK;
  cdata->clear_flg = 0;
  cdata->ratio = 0;
  cdata->checkpoint = CHECK_GAP;

  cdata->input_stream = 0;
  cdata->output_stream = 0;
  cdata->client_data = 0;
  return 1;
}

static void cl_hash(struct cmcompress_stream* cdata, count_int hsize)    /* reset code table */
{
  register count_int *htab_p = cdata->htab+hsize;
  register long i;
  register long m1 = -1;

  i = hsize - 16;
  do
    {        /* might use Sys V memset(3) here */
    *(htab_p-16) = m1;
    *(htab_p-15) = m1;
    *(htab_p-14) = m1;
    *(htab_p-13) = m1;
    *(htab_p-12) = m1;
    *(htab_p-11) = m1;
    *(htab_p-10) = m1;
    *(htab_p-9) = m1;
    *(htab_p-8) = m1;
    *(htab_p-7) = m1;
    *(htab_p-6) = m1;
    *(htab_p-5) = m1;
    *(htab_p-4) = m1;
    *(htab_p-3) = m1;
    *(htab_p-2) = m1;
    *(htab_p-1) = m1;
    htab_p -= 16;
    }
  while ((i -= 16) >= 0);
  for ( i += 16; i > 0; i-- )
    {
    *--htab_p = m1;
    }
}

/*-
 * Output the given code.
 * Inputs:
 *   code:  A n_bits-bit integer.  If == -1, then EOF.  This assumes
 *    that n_bits =< (long)wordsize - 1.
 * Outputs:
 *   Outputs code to the file.
 * Assumptions:
 *  Chars are 8 bits long.
 * Algorithm:
 *   Maintain a BITS character long buffer (so that 8 codes will
 * fit in it exactly).  Use the VAX insv instruction to insert each
 * code in turn.  When the buffer fills up empty it and start over.
 */

static char buf[BITS];

#ifndef vax
char_type lmask[9] = {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00};
char_type rmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
#endif /* vax */

static int output(struct cmcompress_stream* cdata, code_int  code)
{
#ifdef DEBUG
  static int col = 0;
#endif /* DEBUG */

  /*
   * On the VAX, it is important to have the register declarations
   * in exactly the order given, or the asm will break.
   */
  register int r_off = cdata->offset, bits= cdata->n_bits;
  register char * bp = buf;

#ifdef DEBUG
  if ( verbose )
    {
    fprintf( stderr, "%5d%c", code,
      (col+=6) >= 74 ? (col = 0, '\n') : ' ' );
    }
#endif /* DEBUG */
  if ( code >= 0 )
    {
#if defined(vax) && !defined(__GNUC__)
    /*
     * VAX and PCC DEPENDENT!! Implementation on other machines is
     * below.
     *
     * Translation: Insert BITS bits from the argument starting at
     * cdata->offset bits from the beginning of buf.
     */
    0;  /* Work around for pcc -O bug with asm and if stmt */
    asm( "insv  4(ap),r11,r10,(r9)" );
#else
    /*
     * byte/bit numbering on the VAX is simulated by the following code
     */
    /*
     * Get to the first byte.
     */
    bp += (r_off >> 3);
    r_off &= 7;
    /*
     * Since code is always >= 8 bits, only need to mask the first
     * hunk on the left.
     */
    *bp = (char)((*bp & rmask[r_off]) | ((code << r_off) & lmask[r_off]));
    bp++;
    bits -= (8 - r_off);
    code >>= 8 - r_off;
    /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
    if ( bits >= 8 )
      {
      *bp++ = (char)(code);
      code >>= 8;
      bits -= 8;
      }
    /* Last bits. */
    if(bits)
      {
      *bp = (char)(code);
      }
#endif /* vax */
    cdata->offset += cdata->n_bits;
    if ( cdata->offset == (cdata->n_bits << 3) )
      {
      bp = buf;
      bits = cdata->n_bits;
      cdata->bytes_out += bits;
      do
        {
        if ( cdata->output_stream(cdata, bp, 1) != 1 )
          {
          return 0;
          }
        bp++;
        }
      while(--bits);
      cdata->offset = 0;
      }

    /*
     * If the next entry is going to be too big for the code size,
     * then increase it, if possible.
     */
    if ( cdata->free_ent > cdata->maxcode || (cdata->clear_flg > 0))
      {
      /*
       * Write the whole buffer, because the input side won't
       * discover the size increase until after it has read it.
       */
      if ( cdata->offset > 0 )
        {
        if ( cdata->output_stream(cdata, buf, cdata->n_bits) != cdata->n_bits )
          {
          return 0;
          }
        cdata->bytes_out += cdata->n_bits;
        }
      cdata->offset = 0;

      if ( cdata->clear_flg )
        {
        cdata->maxcode = MAXCODE (cdata->n_bits = INIT_BITS);
        cdata->clear_flg = 0;
        }
      else
        {
        cdata->n_bits++;
        if ( cdata->n_bits == cdata->maxbits )
          {
          cdata->maxcode = cdata->maxmaxcode;
          }
        else
          {
          cdata->maxcode = MAXCODE(cdata->n_bits);
          }
        }
#ifdef DEBUG
      if ( debug )
        {
        fprintf( stderr, "\nChange to %d bits\n", cdata->n_bits );
        col = 0;
        }
#endif /* DEBUG */
      }
    }
  else
    {
    /*
     * At EOF, write the rest of the buffer.
     */
    if ( cdata->offset > 0 )
      {
      cdata->offset = (cdata->offset + 7) / 8;
      if ( cdata->output_stream(cdata, buf, cdata->offset ) != cdata->offset )
        {
        return 0;
        }
      cdata->bytes_out += cdata->offset;
      }
    cdata->offset = 0;
    (void)fflush( stdout );
    if( ferror( stdout ) )
      {
      return 0;
      }
#ifdef DEBUG
    if ( verbose )
      {
      fprintf( stderr, "\n" );
      }
#endif
    }
  return 1;
}

/*
 * compress stdin to stdout
 *
 * Algorithm:  use open addressing double hashing (no chaining) on the
 * prefix code / next character combination.  We do a variant of Knuth's
 * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
 * secondary probe.  Here, the modular division first probe is gives way
 * to a faster exclusive-or manipulation.  Also do block compression with
 * an adaptive reset, whereby the code table is cleared when the compression
 * ratio decreases, but after the table fills.  The variable-length output
 * codes are re-sized at this point, and a special CLEAR code is generated
 * for the decompressor.  Late addition:  construct the table according to
 * file size for noticeable speed improvement on small files.  Please direct
 * questions about this implementation to ames!jaw.
 */

int cmcompress_compress_start(struct cmcompress_stream* cdata)
{
#ifndef COMPATIBLE
  if (cdata->nomagic == 0)
    {
    char headLast = (char)(cdata->maxbits | cdata->block_compress);
    cdata->output_stream(cdata, (const char*)magic_header, 2);
    cdata->output_stream(cdata, &headLast, 1);
    if(ferror(stdout))
      {
      printf("Error...\n");
      }
    }
#endif /* COMPATIBLE */

  cdata->offset = 0;
  cdata->bytes_out = 3;    /* includes 3-byte header mojo */
  cdata->out_count = 0;
  cdata->clear_flg = 0;
  cdata->ratio = 0;
  cdata->in_count = 1;
  cdata->checkpoint = CHECK_GAP;
  cdata->maxcode = MAXCODE(cdata->n_bits = INIT_BITS);
  cdata->free_ent = ((cdata->block_compress) ? FIRST : 256 );

  cdata->first_pass = 1;

  cdata->hshift = 0;
  for ( cdata->fcode = (long) cdata->hsize;  cdata->fcode < 65536L; cdata->fcode *= 2L )
    {
    cdata->hshift++;
    }
  cdata->hshift = 8 - cdata->hshift;    /* set hash code range bound */

  cdata->hsize_reg = cdata->hsize;
  cl_hash(cdata, (count_int) cdata->hsize_reg);    /* clear hash table */

  return 1;
}

static int cl_block (struct cmcompress_stream* cdata)    /* table clear for block compress */
{
  register long int rat;

  cdata->checkpoint = cdata->in_count + CHECK_GAP;
#ifdef DEBUG
  if ( cdata->debug )
    {
    fprintf ( stderr, "count: %ld, ratio: ", cdata->in_count );
    prratio ( stderr, cdata->in_count, cdata->bytes_out );
    fprintf ( stderr, "\n");
    }
#endif /* DEBUG */

  if(cdata->in_count > 0x007fffff)
    {  /* shift will overflow */
    rat = cdata->bytes_out >> 8;
    if(rat == 0)
      {    /* Don't divide by zero */
      rat = 0x7fffffff;
      }
    else
      {
      rat = cdata->in_count / rat;
      }
    }
  else
    {
    rat = (cdata->in_count << 8) / cdata->bytes_out;  /* 8 fractional bits */
    }
  if ( rat > cdata->ratio )
    {
    cdata->ratio = rat;
    }
  else
    {
    cdata->ratio = 0;
#ifdef DEBUG
    if(cdata->verbose)
      {
      dump_tab();  /* dump string table */
      }
#endif
    cl_hash (cdata, (count_int) cdata->hsize );
    cdata->free_ent = FIRST;
    cdata->clear_flg = 1;
    if ( !output (cdata, (code_int) CLEAR ) )
      {
      return 0;
      }
#ifdef DEBUG
    if(cdata->debug)
      {
      fprintf ( stderr, "clear\n" );
      }
#endif /* DEBUG */
    }
  return 1;
}


int cmcompress_compress(struct cmcompress_stream* cdata, void* buff, size_t n)
{
  register code_int i;
  register int c;
  register int disp;

  unsigned char* input_buffer = (unsigned char*)buff;

  size_t cc;

  /*printf("cmcompress_compress(%p, %p, %d)\n", cdata, buff, n);*/

  if ( cdata->first_pass )
    {
    cdata->ent = input_buffer[0];
    ++ input_buffer;
    -- n;
    cdata->first_pass = 0;
    }

  for ( cc = 0; cc < n; ++ cc )
    {
    c = input_buffer[cc];
    cdata->in_count++;
    cdata->fcode = (long) (((long) c << cdata->maxbits) + cdata->ent);
    i = ((c << cdata->hshift) ^ cdata->ent);  /* xor hashing */

    if ( htabof (i) == cdata->fcode )
      {
      cdata->ent = codetabof (i);
      continue;
      }
    else if ( (long)htabof (i) < 0 )  /* empty slot */
      {
      goto nomatch;
      }
    disp = cdata->hsize_reg - i;    /* secondary hash (after G. Knott) */
    if ( i == 0 )
      {
      disp = 1;
      }
probe:
    if ( (i -= disp) < 0 )
      {
      i += cdata->hsize_reg;
      }

    if ( htabof (i) == cdata->fcode )
      {
      cdata->ent = codetabof (i);
      continue;
      }
    if ( (long)htabof (i) > 0 )
      {
      goto probe;
      }
nomatch:
    if ( !output(cdata, (code_int) cdata->ent ) )
      {
      return 0;
      }
    cdata->out_count++;
    cdata->ent = c;
    if (
#ifdef SIGNED_COMPARE_SLOW
      (unsigned) cdata->free_ent < (unsigned) cdata->maxmaxcode
#else
      cdata->free_ent < cdata->maxmaxcode
#endif
    )
      {
      codetabof (i) = (unsigned short)(cdata->free_ent++);  /* code -> hashtable */
      htabof (i) = cdata->fcode;
      }
    else if ( (count_int)cdata->in_count >= cdata->checkpoint && cdata->block_compress )
      {
      if ( !cl_block (cdata) )
        {
        return 0;
        }
      }
    }

  return 1;
}

int cmcompress_compress_finalize(struct cmcompress_stream* cdata)
{
  /*
   * Put out the final code.
   */
  if ( !output(cdata, (code_int)cdata->ent ) )
    {
    return 0;
    }
  cdata->out_count++;
  if ( !output(cdata, (code_int)-1 ) )
    {
    return 0;
    }

  if(cdata->bytes_out > cdata->in_count)  /* exit(2) if no savings */
    {
    return 0;
    }
  return 1;
}


#if defined(DEBUG)
static void prratio(FILE *stream, long int num, long int den)
{
  register int q;      /* Doesn't need to be long */

  if(num > 214748L)
    {    /* 2147483647/10000 */
    q = num / (den / 10000L);
    }
  else
    {
    q = 10000L * num / den;    /* Long calculations, though */
    }
  if (q < 0)
    {
    putc('-', stream);
    q = -q;
    }
  fprintf(stream, "%d.%02d%%", q / 100, q % 100);
}
#endif