varnish-cache/lib/libvgz/crc32.c
0
/* crc32.c -- compute the CRC-32 of a data stream
1
 * Copyright (C) 1995-2022 Mark Adler
2
 * For conditions of distribution and use, see copyright notice in zlib.h
3
 *
4
 * This interleaved implementation of a CRC makes use of pipelined multiple
5
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
6
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
7
 */
8
9
/* @(#) $Id$ */
10
11
/*
12
  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
13
  protection on the static variables used to control the first-use generation
14
  of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
15
  first call get_crc_table() to initialize the tables before allowing more than
16
  one thread to use crc32().
17
18
  MAKECRCH can be #defined to write out crc32.h. A main() routine is also
19
  produced, so that this one source file can be compiled to an executable.
20
 */
21
22
#ifdef MAKECRCH
23
#  include <stdio.h>
24
#  ifndef DYNAMIC_CRC_TABLE
25
#    define DYNAMIC_CRC_TABLE
26
#  endif /* !DYNAMIC_CRC_TABLE */
27
#endif /* MAKECRCH */
28
29
#include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
30
31
 /*
32
  A CRC of a message is computed on N braids of words in the message, where
33
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
34
  running sparse CRCs are calculated respectively on each braid, at these
35
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
36
  This is done starting at a word boundary, and continues until as many blocks
37
  of N * W bytes as are available have been processed. The results are combined
38
  into a single CRC at the end. For this code, N must be in the range 1..6 and
39
  W must be 4 or 8. The upper limit on N can be increased if desired by adding
40
  more #if blocks, extending the patterns apparent in the code. In addition,
41
  crc32.h would need to be regenerated, if the maximum N value is increased.
42
43
  N and W are chosen empirically by benchmarking the execution time on a given
44
  processor. The choices for N and W below were based on testing on Intel Kaby
45
  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
46
  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
47
  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
48
  They were all tested with either gcc or clang, all using the -O3 optimization
49
  level. Your mileage may vary.
50
 */
51
52
/* Define N */
53
#ifdef Z_TESTN
54
#  define N Z_TESTN
55
#else
56
#  define N 5
57
#endif
58
#if N < 1 || N > 6
59
#  error N must be in 1..6
60
#endif
61
62
/*
63
  z_crc_t must be at least 32 bits. z_word_t must be at least as long as
64
  z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
65
  that bytes are eight bits.
66
 */
67
68
/*
69
  Define W and the associated z_word_t type. If W is not defined, then a
70
  braided calculation is not used, and the associated tables and code are not
71
  compiled.
72
 */
73
#ifdef Z_TESTW
74
#  if Z_TESTW-1 != -1
75
#    define W Z_TESTW
76
#  endif
77
#else
78
#  ifdef MAKECRCH
79
#    define W 8         /* required for MAKECRCH */
80
#  else
81
#    if defined(__x86_64__) || defined(__aarch64__)
82
#      define W 8
83
#    else
84
#      define W 4
85
#    endif
86
#  endif
87
#endif
88
#ifdef W
89
#  if W == 8 && defined(Z_U8)
90
     typedef Z_U8 z_word_t;
91
#  elif defined(Z_U4)
92
#    undef W
93
#    define W 4
94
     typedef Z_U4 z_word_t;
95
#  else
96
#    undef W
97
#  endif
98
#endif
99
100
/* Local functions. */
101
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
102
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
103
104
/* If available, use the ARM processor CRC32 instruction. */
105
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
106
#  define ARMCRC32
107
#endif
108
109
#if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
110
/*
111
  Swap the bytes in a z_word_t to convert between little and big endian. Any
112
  self-respecting compiler will optimize this to a single machine byte-swap
113
  instruction, if one is available. This assumes that word_t is either 32 bits
114
  or 64 bits.
115
 */
116
local z_word_t byte_swap OF((z_word_t word));
117
118 0
local z_word_t byte_swap(word)
119
    z_word_t word;
120
{
121
#  if W == 8
122 0
    return
123 0
        (word & 0xff00000000000000) >> 56 |
124 0
        (word & 0xff000000000000) >> 40 |
125 0
        (word & 0xff0000000000) >> 24 |
126 0
        (word & 0xff00000000) >> 8 |
127 0
        (word & 0xff000000) << 8 |
128 0
        (word & 0xff0000) << 24 |
129 0
        (word & 0xff00) << 40 |
130 0
        (word & 0xff) << 56;
131
#  else   /* W == 4 */
132
    return
133
        (word & 0xff000000) >> 24 |
134
        (word & 0xff0000) >> 8 |
135
        (word & 0xff00) << 8 |
136
        (word & 0xff) << 24;
137
#  endif
138
}
139
#endif
140
141
/* CRC polynomial. */
142
#define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
143
144
#ifdef DYNAMIC_CRC_TABLE
145
146
local z_crc_t FAR crc_table[256];
147
local z_crc_t FAR x2n_table[32];
148
local void make_crc_table OF((void));
149
#ifdef W
150
   local z_word_t FAR crc_big_table[256];
151
   local z_crc_t FAR crc_braid_table[W][256];
152
   local z_word_t FAR crc_braid_big_table[W][256];
153
   local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
154
#endif
155
#ifdef MAKECRCH
156
   local void write_table OF((FILE *, const z_crc_t FAR *, int));
157
   local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
158
   local void write_table64 OF((FILE *, const z_word_t FAR *, int));
159
#endif /* MAKECRCH */
160
161
/*
162
  Define a once() function depending on the availability of atomics. If this is
163
  compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
164
  multiple threads, and if atomics are not available, then get_crc_table() must
165
  be called to initialize the tables and must return before any threads are
166
  allowed to compute or combine CRCs.
167
 */
168
169
/* Definition of once functionality. */
170
typedef struct once_s once_t;
171
local void once OF((once_t *, void (*)(void)));
172
173
/* Check for the availability of atomics. */
174
#if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
175
    !defined(__STDC_NO_ATOMICS__)
176
177
#include <stdatomic.h>
178
179
/* Structure for once(), which must be initialized with ONCE_INIT. */
180
struct once_s {
181
    atomic_flag begun;
182
    atomic_int done;
183
};
184
#define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
185
186
/*
187
  Run the provided init() function exactly once, even if multiple threads
188
  invoke once() at the same time. The state must be a once_t initialized with
189
  ONCE_INIT.
190
 */
191
local void once(state, init)
192
    once_t *state;
193
    void (*init)(void);
194
{
195
    if (!atomic_load(&state->done)) {
196
        if (atomic_flag_test_and_set(&state->begun))
197
            while (!atomic_load(&state->done))
198
                ;
199
        else {
200
            init();
201
            atomic_store(&state->done, 1);
202
        }
203
    }
204
}
205
206
#else   /* no atomics */
207
208
/* Structure for once(), which must be initialized with ONCE_INIT. */
209
struct once_s {
210
    volatile int begun;
211
    volatile int done;
212
};
213
#define ONCE_INIT {0, 0}
214
215
/* Test and set. Alas, not atomic, but tries to minimize the period of
216
   vulnerability. */
217
local int test_and_set OF((int volatile *));
218
local int test_and_set(flag)
219
    int volatile *flag;
220
{
221
    int was;
222
223
    was = *flag;
224
    *flag = 1;
225
    return was;
226
}
227
228
/* Run the provided init() function once. This is not thread-safe. */
229
local void once(state, init)
230
    once_t *state;
231
    void (*init)(void);
232
{
233
    if (!state->done) {
234
        if (test_and_set(&state->begun))
235
            while (!state->done)
236
                ;
237
        else {
238
            init();
239
            state->done = 1;
240
        }
241
    }
242
}
243
244
#endif
245
246
/* State for once(). */
247
local once_t made = ONCE_INIT;
248
249
/*
250
  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
251
  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
252
253
  Polynomials over GF(2) are represented in binary, one bit per coefficient,
254
  with the lowest powers in the most significant bit. Then adding polynomials
255
  is just exclusive-or, and multiplying a polynomial by x is a right shift by
256
  one. If we call the above polynomial p, and represent a byte as the
257
  polynomial q, also with the lowest power in the most significant bit (so the
258
  byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
259
  where a mod b means the remainder after dividing a by b.
260
261
  This calculation is done using the shift-register method of multiplying and
262
  taking the remainder. The register is initialized to zero, and for each
263
  incoming bit, x^32 is added mod p to the register if the bit is a one (where
264
  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
265
  (which is shifting right by one and adding x^32 mod p if the bit shifted out
266
  is a one). We start with the highest power (least significant bit) of q and
267
  repeat for all eight bits of q.
268
269
  The table is simply the CRC of all possible eight bit values. This is all the
270
  information needed to generate CRCs on data a byte at a time for all
271
  combinations of CRC register values and incoming bytes.
272
 */
273
274
local void make_crc_table()
275
{
276
    unsigned i, j, n;
277
    z_crc_t p;
278
279
    /* initialize the CRC of bytes tables */
280
    for (i = 0; i < 256; i++) {
281
        p = i;
282
        for (j = 0; j < 8; j++)
283
            p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
284
        crc_table[i] = p;
285
#ifdef W
286
        crc_big_table[i] = byte_swap(p);
287
#endif
288
    }
289
290
    /* initialize the x^2^n mod p(x) table */
291
    p = (z_crc_t)1 << 30;         /* x^1 */
292
    x2n_table[0] = p;
293
    for (n = 1; n < 32; n++)
294
        x2n_table[n] = p = multmodp(p, p);
295
296
#ifdef W
297
    /* initialize the braiding tables -- needs x2n_table[] */
298
    braid(crc_braid_table, crc_braid_big_table, N, W);
299
#endif
300
301
#ifdef MAKECRCH
302
    {
303
        /*
304
          The crc32.h header file contains tables for both 32-bit and 64-bit
305
          z_word_t's, and so requires a 64-bit type be available. In that case,
306
          z_word_t must be defined to be 64-bits. This code then also generates
307
          and writes out the tables for the case that z_word_t is 32 bits.
308
         */
309
#if !defined(W) || W != 8
310
#  error Need a 64-bit integer type in order to generate crc32.h.
311
#endif
312
        FILE *out;
313
        int k, n;
314
        z_crc_t ltl[8][256];
315
        z_word_t big[8][256];
316
317
        out = fopen("crc32.h", "w");
318
        if (out == NULL) return;
319
320
        /* write out little-endian CRC table to crc32.h */
321
        fprintf(out,
322
            "/* crc32.h -- tables for rapid CRC calculation\n"
323
            " * Generated automatically by crc32.c\n */\n"
324
            "\n"
325
            "local const z_crc_t FAR crc_table[] = {\n"
326
            "    ");
327
        write_table(out, crc_table, 256);
328
        fprintf(out,
329
            "};\n");
330
331
        /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
332
        fprintf(out,
333
            "\n"
334
            "#ifdef W\n"
335
            "\n"
336
            "#if W == 8\n"
337
            "\n"
338
            "local const z_word_t FAR crc_big_table[] = {\n"
339
            "    ");
340
        write_table64(out, crc_big_table, 256);
341
        fprintf(out,
342
            "};\n");
343
344
        /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
345
        fprintf(out,
346
            "\n"
347
            "#else /* W == 4 */\n"
348
            "\n"
349
            "local const z_word_t FAR crc_big_table[] = {\n"
350
            "    ");
351
        write_table32hi(out, crc_big_table, 256);
352
        fprintf(out,
353
            "};\n"
354
            "\n"
355
            "#endif\n");
356
357
        /* write out braid tables for each value of N */
358
        for (n = 1; n <= 6; n++) {
359
            fprintf(out,
360
            "\n"
361
            "#if N == %d\n", n);
362
363
            /* compute braid tables for this N and 64-bit word_t */
364
            braid(ltl, big, n, 8);
365
366
            /* write out braid tables for 64-bit z_word_t to crc32.h */
367
            fprintf(out,
368
            "\n"
369
            "#if W == 8\n"
370
            "\n"
371
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
372
            for (k = 0; k < 8; k++) {
373
                fprintf(out, "   {");
374
                write_table(out, ltl[k], 256);
375
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
376
            }
377
            fprintf(out,
378
            "};\n"
379
            "\n"
380
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
381
            for (k = 0; k < 8; k++) {
382
                fprintf(out, "   {");
383
                write_table64(out, big[k], 256);
384
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
385
            }
386
            fprintf(out,
387
            "};\n");
388
389
            /* compute braid tables for this N and 32-bit word_t */
390
            braid(ltl, big, n, 4);
391
392
            /* write out braid tables for 32-bit z_word_t to crc32.h */
393
            fprintf(out,
394
            "\n"
395
            "#else /* W == 4 */\n"
396
            "\n"
397
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
398
            for (k = 0; k < 4; k++) {
399
                fprintf(out, "   {");
400
                write_table(out, ltl[k], 256);
401
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
402
            }
403
            fprintf(out,
404
            "};\n"
405
            "\n"
406
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
407
            for (k = 0; k < 4; k++) {
408
                fprintf(out, "   {");
409
                write_table32hi(out, big[k], 256);
410
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
411
            }
412
            fprintf(out,
413
            "};\n"
414
            "\n"
415
            "#endif\n"
416
            "\n"
417
            "#endif\n");
418
        }
419
        fprintf(out,
420
            "\n"
421
            "#endif\n");
422
423
        /* write out zeros operator table to crc32.h */
424
        fprintf(out,
425
            "\n"
426
            "local const z_crc_t FAR x2n_table[] = {\n"
427
            "    ");
428
        write_table(out, x2n_table, 32);
429
        fprintf(out,
430
            "};\n");
431
        fclose(out);
432
    }
433
#endif /* MAKECRCH */
434
}
435
436
#ifdef MAKECRCH
437
438
/*
439
   Write the 32-bit values in table[0..k-1] to out, five per line in
440
   hexadecimal separated by commas.
441
 */
442
local void write_table(out, table, k)
443
    FILE *out;
444
    const z_crc_t FAR *table;
445
    int k;
446
{
447
    int n;
448
449
    for (n = 0; n < k; n++)
450
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
451
                (unsigned long)(table[n]),
452
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
453
}
454
455
/*
456
   Write the high 32-bits of each value in table[0..k-1] to out, five per line
457
   in hexadecimal separated by commas.
458
 */
459
local void write_table32hi(out, table, k)
460
FILE *out;
461
const z_word_t FAR *table;
462
int k;
463
{
464
    int n;
465
466
    for (n = 0; n < k; n++)
467
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
468
                (unsigned long)(table[n] >> 32),
469
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
470
}
471
472
/*
473
  Write the 64-bit values in table[0..k-1] to out, three per line in
474
  hexadecimal separated by commas. This assumes that if there is a 64-bit
475
  type, then there is also a long long integer type, and it is at least 64
476
  bits. If not, then the type cast and format string can be adjusted
477
  accordingly.
478
 */
479
local void write_table64(out, table, k)
480
    FILE *out;
481
    const z_word_t FAR *table;
482
    int k;
483
{
484
    int n;
485
486
    for (n = 0; n < k; n++)
487
        fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
488
                (unsigned long long)(table[n]),
489
                n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
490
}
491
492
/* Actually do the deed. */
493
int main()
494
{
495
    make_crc_table();
496
    return 0;
497
}
498
499
#endif /* MAKECRCH */
500
501
#ifdef W
502
/*
503
  Generate the little and big-endian braid tables for the given n and z_word_t
504
  size w. Each array must have room for w blocks of 256 elements.
505
 */
506
local void braid(ltl, big, n, w)
507
    z_crc_t ltl[][256];
508
    z_word_t big[][256];
509
    int n;
510
    int w;
511
{
512
    int k;
513
    z_crc_t i, p, q;
514
    for (k = 0; k < w; k++) {
515
        p = x2nmodp((n * w + 3 - k) << 3, 0);
516
        ltl[k][0] = 0;
517
        big[w - 1 - k][0] = 0;
518
        for (i = 1; i < 256; i++) {
519
            ltl[k][i] = q = multmodp(i << 24, p);
520
            big[w - 1 - k][i] = byte_swap(q);
521
        }
522
    }
523
}
524
#endif
525
526
#else /* !DYNAMIC_CRC_TABLE */
527
/* ========================================================================
528
 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
529
 * of x for combining CRC-32s, all made by make_crc_table().
530
 */
531
#include "crc32.h"
532
#endif /* DYNAMIC_CRC_TABLE */
533
534
/* ========================================================================
535
 * Routines used for CRC calculation. Some are also required for the table
536
 * generation above.
537
 */
538
539
/*
540
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
541
  reflected. For speed, this requires that a not be zero.
542
 */
543 1494
local z_crc_t multmodp(a, b)
544
    z_crc_t a;
545
    z_crc_t b;
546
{
547
    z_crc_t m, p;
548
549 1494
    m = (z_crc_t)1 << 31;
550 1494
    p = 0;
551 35497
    for (;;) {
552 35497
        if (a & m) {
553 13935
            p ^= b;
554 13935
            if ((a & (m - 1)) == 0)
555 1494
                break;
556 12441
        }
557 34003
        m >>= 1;
558 34003
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
559
    }
560 1494
    return p;
561
}
562
563
/*
564
  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
565
  initialized.
566
 */
567 441
local z_crc_t x2nmodp(n, k)
568
    z_off64_t n;
569
    unsigned k;
570
{
571
    z_crc_t p;
572
573 441
    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
574 2076
    while (n) {
575 1635
        if (n & 1)
576 1053
            p = multmodp(x2n_table[k & 31], p);
577 1635
        n >>= 1;
578 1635
        k++;
579
    }
580 441
    return p;
581
}
582
583
/* =========================================================================
584
 * This function can be used by asm versions of crc32(), and to force the
585
 * generation of the CRC tables in a threaded application.
586
 */
587 0
const z_crc_t FAR * ZEXPORT get_crc_table()
588
{
589
#ifdef DYNAMIC_CRC_TABLE
590
    once(&made, make_crc_table);
591
#endif /* DYNAMIC_CRC_TABLE */
592 0
    return (const z_crc_t FAR *)crc_table;
593
}
594
595
/* =========================================================================
596
 * Use ARM machine instructions if available. This will compute the CRC about
597
 * ten times faster than the braided calculation. This code does not check for
598
 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
599
 * only be defined if the compilation specifies an ARM processor architecture
600
 * that has the instructions. For example, compiling with -march=armv8.1-a or
601
 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
602
 * instructions.
603
 */
604
#ifdef ARMCRC32
605
606
/*
607
   Constants empirically determined to maximize speed. These values are from
608
   measurements on a Cortex-A57. Your mileage may vary.
609
 */
610
#define Z_BATCH 3990                /* number of words in a batch */
611
#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
612
#define Z_BATCH_MIN 800             /* fewest words in a final batch */
613
614
unsigned long ZEXPORT crc32_z(crc, buf, len)
615
    unsigned long crc;
616
    const unsigned char FAR *buf;
617
    z_size_t len;
618
{
619
    z_crc_t val;
620
    z_word_t crc1, crc2;
621
    const z_word_t *word;
622
    z_word_t val0, val1, val2;
623
    z_size_t last, last2, i;
624
    z_size_t num;
625
626
    /* Return initial CRC, if requested. */
627
    if (buf == Z_NULL) return 0;
628
629
#ifdef DYNAMIC_CRC_TABLE
630
    once(&made, make_crc_table);
631
#endif /* DYNAMIC_CRC_TABLE */
632
633
    /* Pre-condition the CRC */
634
    crc = (~crc) & 0xffffffff;
635
636
    /* Compute the CRC up to a word boundary. */
637
    while (len && ((z_size_t)buf & 7) != 0) {
638
        len--;
639
        val = *buf++;
640
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
641
    }
642
643
    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
644
    word = (z_word_t const *)(intptr_t)buf;
645
    num = len >> 3;
646
    len &= 7;
647
648
    /* Do three interleaved CRCs to realize the throughput of one crc32x
649
       instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
650
       CRCs are combined into a single CRC after each set of batches. */
651
    while (num >= 3 * Z_BATCH) {
652
        crc1 = 0;
653
        crc2 = 0;
654
        for (i = 0; i < Z_BATCH; i++) {
655
            val0 = word[i];
656
            val1 = word[i + Z_BATCH];
657
            val2 = word[i + 2 * Z_BATCH];
658
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
659
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
660
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
661
        }
662
        word += 3 * Z_BATCH;
663
        num -= 3 * Z_BATCH;
664
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
665
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
666
    }
667
668
    /* Do one last smaller batch with the remaining words, if there are enough
669
       to pay for the combination of CRCs. */
670
    last = num / 3;
671
    if (last >= Z_BATCH_MIN) {
672
        last2 = last << 1;
673
        crc1 = 0;
674
        crc2 = 0;
675
        for (i = 0; i < last; i++) {
676
            val0 = word[i];
677
            val1 = word[i + last];
678
            val2 = word[i + last2];
679
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
680
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
681
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
682
        }
683
        word += 3 * last;
684
        num -= 3 * last;
685
        val = x2nmodp(last, 6);
686
        crc = multmodp(val, crc) ^ crc1;
687
        crc = multmodp(val, crc) ^ crc2;
688
    }
689
690
    /* Compute the CRC on any remaining words. */
691
    for (i = 0; i < num; i++) {
692
        val0 = word[i];
693
        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
694
    }
695
    word += num;
696
697
    /* Complete the CRC on any remaining bytes. */
698
    buf = (const unsigned char FAR *)word;
699
    while (len) {
700
        len--;
701
        val = *buf++;
702
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
703
    }
704
705
    /* Return the CRC, post-conditioned. */
706
    return crc ^ 0xffffffff;
707
}
708
709
#else
710
711
#ifdef W
712
713
/*
714
  Return the CRC of the W bytes in the word_t data, taking the
715
  least-significant byte of the word as the first byte of data, without any pre
716
  or post conditioning. This is used to combine the CRCs of each braid.
717
 */
718
local z_crc_t crc_word OF((z_word_t data));
719
720 165670
local z_crc_t crc_word(data)
721
    z_word_t data;
722
{
723
    int k;
724 1491030
    for (k = 0; k < W; k++)
725 1325360
        data = (data >> 8) ^ crc_table[data & 0xff];
726 165670
    return (z_crc_t)data;
727
}
728
729
local z_word_t crc_word_big OF((z_word_t data));
730
731 0
local z_word_t crc_word_big(data)
732
    z_word_t data;
733
{
734
    int k;
735 0
    for (k = 0; k < W; k++)
736 0
        data = (data << 8) ^
737 0
            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
738 0
    return data;
739
}
740
741
#endif
742
743
/* ========================================================================= */
744 211053
unsigned long ZEXPORT crc32_z(crc, buf, len)
745
    unsigned long crc;
746
    const unsigned char FAR *buf;
747
    z_size_t len;
748
{
749
    /* Return initial CRC, if requested. */
750 211053
    if (buf == Z_NULL) return 0;
751
752
#ifdef DYNAMIC_CRC_TABLE
753
    once(&made, make_crc_table);
754
#endif /* DYNAMIC_CRC_TABLE */
755
756
    /* Pre-condition the CRC */
757 204116
    crc = (~crc) & 0xffffffff;
758
759
#ifdef W
760
761
    /* If provided enough bytes, do a braided CRC calculation. */
762 204116
    if (len >= N * W + W - 1) {
763
        z_size_t blks;
764
        z_word_t const *words;
765
        unsigned endian;
766
        int k;
767
768
        /* Compute the CRC up to a z_word_t boundary. */
769 149417
        while (len && ((z_size_t)buf & (W - 1)) != 0) {
770 116283
            len--;
771 116283
            crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
772
        }
773
774
        /* Compute the CRC on as many N z_word_t blocks as are available. */
775 33134
        blks = len / (N * W);
776 33134
        len -= blks * N * W;
777 33134
        words = (z_word_t const *)(intptr_t)buf;
778
779
        /* Do endian check at execution time instead of compile time, since ARM
780
           processors can change the endianess at execution time. If the
781
           compiler knows what the endianess will be, it can optimize out the
782
           check and the unused branch. */
783 33134
        endian = 1;
784 33134
        if (*(unsigned char *)&endian) {
785
            /* Little endian. */
786
787
            z_crc_t crc0;
788
            z_word_t word0;
789
#if N > 1
790
            z_crc_t crc1;
791
            z_word_t word1;
792
#if N > 2
793
            z_crc_t crc2;
794
            z_word_t word2;
795
#if N > 3
796
            z_crc_t crc3;
797
            z_word_t word3;
798
#if N > 4
799
            z_crc_t crc4;
800
            z_word_t word4;
801
#if N > 5
802
            z_crc_t crc5;
803
            z_word_t word5;
804
#endif
805
#endif
806
#endif
807
#endif
808
#endif
809
810
            /* Initialize the CRC for each braid. */
811 33134
            crc0 = crc;
812
#if N > 1
813 33134
            crc1 = 0;
814
#if N > 2
815 33134
            crc2 = 0;
816
#if N > 3
817 33134
            crc3 = 0;
818
#if N > 4
819 33134
            crc4 = 0;
820
#if N > 5
821
            crc5 = 0;
822
#endif
823
#endif
824
#endif
825
#endif
826
#endif
827
828
            /*
829
              Process the first blks-1 blocks, computing the CRCs on each braid
830
              independently.
831
             */
832 297988
            while (--blks) {
833
                /* Load the word for each braid into registers. */
834 264854
                word0 = crc0 ^ words[0];
835
#if N > 1
836 264854
                word1 = crc1 ^ words[1];
837
#if N > 2
838 264854
                word2 = crc2 ^ words[2];
839
#if N > 3
840 264854
                word3 = crc3 ^ words[3];
841
#if N > 4
842 264854
                word4 = crc4 ^ words[4];
843
#if N > 5
844
                word5 = crc5 ^ words[5];
845
#endif
846
#endif
847
#endif
848
#endif
849
#endif
850 264854
                words += N;
851
852
                /* Compute and update the CRC for each word. The loop should
853
                   get unrolled. */
854 264854
                crc0 = crc_braid_table[0][word0 & 0xff];
855
#if N > 1
856 264854
                crc1 = crc_braid_table[0][word1 & 0xff];
857
#if N > 2
858 264854
                crc2 = crc_braid_table[0][word2 & 0xff];
859
#if N > 3
860 264854
                crc3 = crc_braid_table[0][word3 & 0xff];
861
#if N > 4
862 264854
                crc4 = crc_braid_table[0][word4 & 0xff];
863
#if N > 5
864
                crc5 = crc_braid_table[0][word5 & 0xff];
865
#endif
866
#endif
867
#endif
868
#endif
869
#endif
870 2118832
                for (k = 1; k < W; k++) {
871 1853978
                    crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
872
#if N > 1
873 1853978
                    crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
874
#if N > 2
875 1853978
                    crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
876
#if N > 3
877 1853978
                    crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
878
#if N > 4
879 1853978
                    crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
880
#if N > 5
881
                    crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
882
#endif
883
#endif
884
#endif
885
#endif
886
#endif
887 1853978
                }
888
            }
889
890
            /*
891
              Process the last block, combining the CRCs of the N braids at the
892
              same time.
893
             */
894 33134
            crc = crc_word(crc0 ^ words[0]);
895
#if N > 1
896 33134
            crc = crc_word(crc1 ^ words[1] ^ crc);
897
#if N > 2
898 33134
            crc = crc_word(crc2 ^ words[2] ^ crc);
899
#if N > 3
900 33134
            crc = crc_word(crc3 ^ words[3] ^ crc);
901
#if N > 4
902 33134
            crc = crc_word(crc4 ^ words[4] ^ crc);
903
#if N > 5
904
            crc = crc_word(crc5 ^ words[5] ^ crc);
905
#endif
906
#endif
907
#endif
908
#endif
909
#endif
910 33134
            words += N;
911 33134
        }
912
        else {
913
            /* Big endian. */
914
915
            z_word_t crc0, word0, comb;
916
#if N > 1
917
            z_word_t crc1, word1;
918
#if N > 2
919
            z_word_t crc2, word2;
920
#if N > 3
921
            z_word_t crc3, word3;
922
#if N > 4
923
            z_word_t crc4, word4;
924
#if N > 5
925
            z_word_t crc5, word5;
926
#endif
927
#endif
928
#endif
929
#endif
930
#endif
931
932
            /* Initialize the CRC for each braid. */
933 0
            crc0 = byte_swap(crc);
934
#if N > 1
935 0
            crc1 = 0;
936
#if N > 2
937 0
            crc2 = 0;
938
#if N > 3
939 0
            crc3 = 0;
940
#if N > 4
941 0
            crc4 = 0;
942
#if N > 5
943
            crc5 = 0;
944
#endif
945
#endif
946
#endif
947
#endif
948
#endif
949
950
            /*
951
              Process the first blks-1 blocks, computing the CRCs on each braid
952
              independently.
953
             */
954 0
            while (--blks) {
955
                /* Load the word for each braid into registers. */
956 0
                word0 = crc0 ^ words[0];
957
#if N > 1
958 0
                word1 = crc1 ^ words[1];
959
#if N > 2
960 0
                word2 = crc2 ^ words[2];
961
#if N > 3
962 0
                word3 = crc3 ^ words[3];
963
#if N > 4
964 0
                word4 = crc4 ^ words[4];
965
#if N > 5
966
                word5 = crc5 ^ words[5];
967
#endif
968
#endif
969
#endif
970
#endif
971
#endif
972 0
                words += N;
973
974
                /* Compute and update the CRC for each word. The loop should
975
                   get unrolled. */
976 0
                crc0 = crc_braid_big_table[0][word0 & 0xff];
977
#if N > 1
978 0
                crc1 = crc_braid_big_table[0][word1 & 0xff];
979
#if N > 2
980 0
                crc2 = crc_braid_big_table[0][word2 & 0xff];
981
#if N > 3
982 0
                crc3 = crc_braid_big_table[0][word3 & 0xff];
983
#if N > 4
984 0
                crc4 = crc_braid_big_table[0][word4 & 0xff];
985
#if N > 5
986
                crc5 = crc_braid_big_table[0][word5 & 0xff];
987
#endif
988
#endif
989
#endif
990
#endif
991
#endif
992 0
                for (k = 1; k < W; k++) {
993 0
                    crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
994
#if N > 1
995 0
                    crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
996
#if N > 2
997 0
                    crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
998
#if N > 3
999 0
                    crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1000
#if N > 4
1001 0
                    crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1002
#if N > 5
1003
                    crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1004
#endif
1005
#endif
1006
#endif
1007
#endif
1008
#endif
1009 0
                }
1010
            }
1011
1012
            /*
1013
              Process the last block, combining the CRCs of the N braids at the
1014
              same time.
1015
             */
1016 0
            comb = crc_word_big(crc0 ^ words[0]);
1017
#if N > 1
1018 0
            comb = crc_word_big(crc1 ^ words[1] ^ comb);
1019
#if N > 2
1020 0
            comb = crc_word_big(crc2 ^ words[2] ^ comb);
1021
#if N > 3
1022 0
            comb = crc_word_big(crc3 ^ words[3] ^ comb);
1023
#if N > 4
1024 0
            comb = crc_word_big(crc4 ^ words[4] ^ comb);
1025
#if N > 5
1026
            comb = crc_word_big(crc5 ^ words[5] ^ comb);
1027
#endif
1028
#endif
1029
#endif
1030
#endif
1031
#endif
1032 0
            words += N;
1033 0
            crc = byte_swap(comb);
1034
        }
1035
1036
        /*
1037
          Update the pointer to the remaining bytes to process.
1038
         */
1039 33134
        buf = (unsigned char const *)words;
1040 33134
    }
1041
1042
#endif /* W */
1043
1044
    /* Complete the computation of the CRC on any remaining bytes. */
1045 273362
    while (len >= 8) {
1046 69246
        len -= 8;
1047 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1048 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1049 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1051 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054 69246
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055
    }
1056 747418
    while (len) {
1057 543302
        len--;
1058 543302
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059
    }
1060
1061
    /* Return the CRC, post-conditioned. */
1062 204116
    return crc ^ 0xffffffff;
1063 211053
}
1064
1065
#endif
1066
1067
/* ========================================================================= */
1068 211053
unsigned long ZEXPORT crc32(crc, buf, len)
1069
    unsigned long crc;
1070
    const unsigned char FAR *buf;
1071
    uInt len;
1072
{
1073 211053
    return crc32_z(crc, buf, len);
1074
}
1075
1076
/* ========================================================================= */
1077 441
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1078
    uLong crc1;
1079
    uLong crc2;
1080
    z_off64_t len2;
1081
{
1082
#ifdef DYNAMIC_CRC_TABLE
1083
    once(&made, make_crc_table);
1084
#endif /* DYNAMIC_CRC_TABLE */
1085 441
    return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1086
}
1087
1088
/* ========================================================================= */
1089 441
uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1090
    uLong crc1;
1091
    uLong crc2;
1092
    z_off_t len2;
1093
{
1094 441
    return crc32_combine64(crc1, crc2, len2);
1095
}
1096
1097
/* ========================================================================= */
1098 0
uLong ZEXPORT crc32_combine_gen64(len2)
1099
    z_off64_t len2;
1100
{
1101
#ifdef DYNAMIC_CRC_TABLE
1102
    once(&made, make_crc_table);
1103
#endif /* DYNAMIC_CRC_TABLE */
1104 0
    return x2nmodp(len2, 3);
1105
}
1106
1107
/* ========================================================================= */
1108 0
uLong ZEXPORT crc32_combine_gen(len2)
1109
    z_off_t len2;
1110
{
1111 0
    return crc32_combine_gen64(len2);
1112
}
1113
1114
/* ========================================================================= */
1115 0
uLong crc32_combine_op(crc1, crc2, op)
1116
    uLong crc1;
1117
    uLong crc2;
1118
    uLong op;
1119
{
1120 0
    return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1121
}