varnish-cache/lib/libvgz/crc32.c
0
/* crc32.c -- compute the CRC-32 of a data stream
1
 * Copyright (C) 1995-2022 Mark Adler
2
 * For conditions of distribution and use, see copyright notice in zlib.h
3
 *
4
 * This interleaved implementation of a CRC makes use of pipelined multiple
5
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
6
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
7
 */
8
9
/* @(#) $Id$ */
10
11
/*
12
  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
13
  protection on the static variables used to control the first-use generation
14
  of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
15
  first call get_crc_table() to initialize the tables before allowing more than
16
  one thread to use crc32().
17
18
  MAKECRCH can be #defined to write out crc32.h. A main() routine is also
19
  produced, so that this one source file can be compiled to an executable.
20
 */
21
22
#ifdef MAKECRCH
23
#  include <stdio.h>
24
#  ifndef DYNAMIC_CRC_TABLE
25
#    define DYNAMIC_CRC_TABLE
26
#  endif /* !DYNAMIC_CRC_TABLE */
27
#endif /* MAKECRCH */
28
29
#include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
30
31
 /*
32
  A CRC of a message is computed on N braids of words in the message, where
33
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
34
  running sparse CRCs are calculated respectively on each braid, at these
35
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
36
  This is done starting at a word boundary, and continues until as many blocks
37
  of N * W bytes as are available have been processed. The results are combined
38
  into a single CRC at the end. For this code, N must be in the range 1..6 and
39
  W must be 4 or 8. The upper limit on N can be increased if desired by adding
40
  more #if blocks, extending the patterns apparent in the code. In addition,
41
  crc32.h would need to be regenerated, if the maximum N value is increased.
42
43
  N and W are chosen empirically by benchmarking the execution time on a given
44
  processor. The choices for N and W below were based on testing on Intel Kaby
45
  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
46
  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
47
  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
48
  They were all tested with either gcc or clang, all using the -O3 optimization
49
  level. Your mileage may vary.
50
 */
51
52
/* Define N */
53
#ifdef Z_TESTN
54
#  define N Z_TESTN
55
#else
56
#  define N 5
57
#endif
58
#if N < 1 || N > 6
59
#  error N must be in 1..6
60
#endif
61
62
/*
63
  z_crc_t must be at least 32 bits. z_word_t must be at least as long as
64
  z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
65
  that bytes are eight bits.
66
 */
67
68
/*
69
  Define W and the associated z_word_t type. If W is not defined, then a
70
  braided calculation is not used, and the associated tables and code are not
71
  compiled.
72
 */
73
#ifdef Z_TESTW
74
#  if Z_TESTW-1 != -1
75
#    define W Z_TESTW
76
#  endif
77
#else
78
#  ifdef MAKECRCH
79
#    define W 8         /* required for MAKECRCH */
80
#  else
81
#    if defined(__x86_64__) || defined(__aarch64__)
82
#      define W 8
83
#    else
84
#      define W 4
85
#    endif
86
#  endif
87
#endif
88
#ifdef W
89
#  if W == 8 && defined(Z_U8)
90
     typedef Z_U8 z_word_t;
91
#  elif defined(Z_U4)
92
#    undef W
93
#    define W 4
94
     typedef Z_U4 z_word_t;
95
#  else
96
#    undef W
97
#  endif
98
#endif
99
100
/* If available, use the ARM processor CRC32 instruction. */
101
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
102
#  define ARMCRC32
103
#endif
104
105
#if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
106
/*
107
  Swap the bytes in a z_word_t to convert between little and big endian. Any
108
  self-respecting compiler will optimize this to a single machine byte-swap
109
  instruction, if one is available. This assumes that word_t is either 32 bits
110
  or 64 bits.
111
 */
112 0
local z_word_t byte_swap (z_word_t word) {
113
#  if W == 8
114 0
    return
115 0
        (word & 0xff00000000000000) >> 56 |
116 0
        (word & 0xff000000000000) >> 40 |
117 0
        (word & 0xff0000000000) >> 24 |
118 0
        (word & 0xff00000000) >> 8 |
119 0
        (word & 0xff000000) << 8 |
120 0
        (word & 0xff0000) << 24 |
121 0
        (word & 0xff00) << 40 |
122 0
        (word & 0xff) << 56;
123
#  else   /* W == 4 */
124
    return
125
        (word & 0xff000000) >> 24 |
126
        (word & 0xff0000) >> 8 |
127
        (word & 0xff00) << 8 |
128
        (word & 0xff) << 24;
129
#  endif
130
}
131
#endif
132
133
#ifdef DYNAMIC_CRC_TABLE
134
/* =========================================================================
135
 * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
136
 * below.
137
 */
138
   local z_crc_t FAR x2n_table[32];
139
#else
140
/* =========================================================================
141
 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
142
 * of x for combining CRC-32s, all made by make_crc_table().
143
 */
144
#  include "crc32.h"
145
#endif
146
147
/* CRC polynomial. */
148
#define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
149
150
/*
151
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
152
  reflected. For speed, this requires that a not be zero.
153
 */
154 65225
local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
155
    z_crc_t m, p;
156
157 65225
    m = (z_crc_t)1 << 31;
158 65225
    p = 0;
159 1653313
    for (;;) {
160 1653313
        if (a & m) {
161 670447
            p ^= b;
162 670447
            if ((a & (m - 1)) == 0)
163 65225
                break;
164 605222
        }
165 1588088
        m >>= 1;
166 1588088
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
167
    }
168 65225
    return p;
169
}
170
171
/*
172
  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
173
  initialized.
174
 */
175 15961
local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
176
    z_crc_t p;
177
178 15961
    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
179 154106
    while (n) {
180 138145
        if (n & 1)
181 49263
            p = multmodp(x2n_table[k & 31], p);
182 138145
        n >>= 1;
183 138145
        k++;
184
    }
185 15961
    return p;
186
}
187
188
#ifdef DYNAMIC_CRC_TABLE
189
/* =========================================================================
190
 * Build the tables for byte-wise and braided CRC-32 calculations, and a table
191
 * of powers of x for combining CRC-32s.
192
 */
193
local z_crc_t FAR crc_table[256];
194
#ifdef W
195
   local z_word_t FAR crc_big_table[256];
196
   local z_crc_t FAR crc_braid_table[W][256];
197
   local z_word_t FAR crc_braid_big_table[W][256];
198
   local void braid (z_crc_t [][256], z_word_t [][256], int, int);
199
#endif
200
#ifdef MAKECRCH
201
   local void write_table (FILE *, const z_crc_t FAR *, int);
202
   local void write_table32hi (FILE *, const z_word_t FAR *, int);
203
   local void write_table64 (FILE *, const z_word_t FAR *, int);
204
#endif /* MAKECRCH */
205
206
/*
207
  Define a once() function depending on the availability of atomics. If this is
208
  compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
209
  multiple threads, and if atomics are not available, then get_crc_table() must
210
  be called to initialize the tables and must return before any threads are
211
  allowed to compute or combine CRCs.
212
 */
213
214
/* Definition of once functionality. */
215
typedef struct once_s once_t;
216
217
/* Check for the availability of atomics. */
218
#if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
219
    !defined(__STDC_NO_ATOMICS__)
220
221
#include <stdatomic.h>
222
223
/* Structure for once(), which must be initialized with ONCE_INIT. */
224
struct once_s {
225
    atomic_flag begun;
226
    atomic_int done;
227
};
228
#define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
229
230
/*
231
  Run the provided init() function exactly once, even if multiple threads
232
  invoke once() at the same time. The state must be a once_t initialized with
233
  ONCE_INIT.
234
 */
235
local void once(once_t *state, void (*init)(void)) {
236
    if (!atomic_load(&state->done)) {
237
        if (atomic_flag_test_and_set(&state->begun))
238
            while (!atomic_load(&state->done))
239
                ;
240
        else {
241
            init();
242
            atomic_store(&state->done, 1);
243
        }
244
    }
245
}
246
247
#else   /* no atomics */
248
249
/* Structure for once(), which must be initialized with ONCE_INIT. */
250
struct once_s {
251
    volatile int begun;
252
    volatile int done;
253
};
254
#define ONCE_INIT {0, 0}
255
256
/* Test and set. Alas, not atomic, but tries to minimize the period of
257
   vulnerability. */
258
local int test_and_set (int volatile *flag) {
259
    int was;
260
261
    was = *flag;
262
    *flag = 1;
263
    return was;
264
}
265
266
/* Run the provided init() function once. This is not thread-safe. */
267
local void once(once_t *state, void (*init)(void)) {
268
    if (!state->done) {
269
        if (test_and_set(&state->begun))
270
            while (!state->done)
271
                ;
272
        else {
273
            init();
274
            state->done = 1;
275
        }
276
    }
277
}
278
279
#endif
280
281
/* State for once(). */
282
local once_t made = ONCE_INIT;
283
284
/*
285
  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
286
  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
287
288
  Polynomials over GF(2) are represented in binary, one bit per coefficient,
289
  with the lowest powers in the most significant bit. Then adding polynomials
290
  is just exclusive-or, and multiplying a polynomial by x is a right shift by
291
  one. If we call the above polynomial p, and represent a byte as the
292
  polynomial q, also with the lowest power in the most significant bit (so the
293
  byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
294
  where a mod b means the remainder after dividing a by b.
295
296
  This calculation is done using the shift-register method of multiplying and
297
  taking the remainder. The register is initialized to zero, and for each
298
  incoming bit, x^32 is added mod p to the register if the bit is a one (where
299
  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
300
  (which is shifting right by one and adding x^32 mod p if the bit shifted out
301
  is a one). We start with the highest power (least significant bit) of q and
302
  repeat for all eight bits of q.
303
304
  The table is simply the CRC of all possible eight bit values. This is all the
305
  information needed to generate CRCs on data a byte at a time for all
306
  combinations of CRC register values and incoming bytes.
307
 */
308
309
local void make_crc_table(void) {
310
    unsigned i, j, n;
311
    z_crc_t p;
312
313
    /* initialize the CRC of bytes tables */
314
    for (i = 0; i < 256; i++) {
315
        p = i;
316
        for (j = 0; j < 8; j++)
317
            p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
318
        crc_table[i] = p;
319
#ifdef W
320
        crc_big_table[i] = byte_swap(p);
321
#endif
322
    }
323
324
    /* initialize the x^2^n mod p(x) table */
325
    p = (z_crc_t)1 << 30;         /* x^1 */
326
    x2n_table[0] = p;
327
    for (n = 1; n < 32; n++)
328
        x2n_table[n] = p = multmodp(p, p);
329
330
#ifdef W
331
    /* initialize the braiding tables -- needs x2n_table[] */
332
    braid(crc_braid_table, crc_braid_big_table, N, W);
333
#endif
334
335
#ifdef MAKECRCH
336
    {
337
        /*
338
          The crc32.h header file contains tables for both 32-bit and 64-bit
339
          z_word_t's, and so requires a 64-bit type be available. In that case,
340
          z_word_t must be defined to be 64-bits. This code then also generates
341
          and writes out the tables for the case that z_word_t is 32 bits.
342
         */
343
#if !defined(W) || W != 8
344
#  error Need a 64-bit integer type in order to generate crc32.h.
345
#endif
346
        FILE *out;
347
        int k, n;
348
        z_crc_t ltl[8][256];
349
        z_word_t big[8][256];
350
351
        out = fopen("crc32.h", "w");
352
        if (out == NULL) return;
353
354
        /* write out little-endian CRC table to crc32.h */
355
        fprintf(out,
356
            "/* crc32.h -- tables for rapid CRC calculation\n"
357
            " * Generated automatically by crc32.c\n */\n"
358
            "\n"
359
            "local const z_crc_t FAR crc_table[] = {\n"
360
            "    ");
361
        write_table(out, crc_table, 256);
362
        fprintf(out,
363
            "};\n");
364
365
        /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
366
        fprintf(out,
367
            "\n"
368
            "#ifdef W\n"
369
            "\n"
370
            "#if W == 8\n"
371
            "\n"
372
            "local const z_word_t FAR crc_big_table[] = {\n"
373
            "    ");
374
        write_table64(out, crc_big_table, 256);
375
        fprintf(out,
376
            "};\n");
377
378
        /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
379
        fprintf(out,
380
            "\n"
381
            "#else /* W == 4 */\n"
382
            "\n"
383
            "local const z_word_t FAR crc_big_table[] = {\n"
384
            "    ");
385
        write_table32hi(out, crc_big_table, 256);
386
        fprintf(out,
387
            "};\n"
388
            "\n"
389
            "#endif\n");
390
391
        /* write out braid tables for each value of N */
392
        for (n = 1; n <= 6; n++) {
393
            fprintf(out,
394
            "\n"
395
            "#if N == %d\n", n);
396
397
            /* compute braid tables for this N and 64-bit word_t */
398
            braid(ltl, big, n, 8);
399
400
            /* write out braid tables for 64-bit z_word_t to crc32.h */
401
            fprintf(out,
402
            "\n"
403
            "#if W == 8\n"
404
            "\n"
405
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
406
            for (k = 0; k < 8; k++) {
407
                fprintf(out, "   {");
408
                write_table(out, ltl[k], 256);
409
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
410
            }
411
            fprintf(out,
412
            "};\n"
413
            "\n"
414
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
415
            for (k = 0; k < 8; k++) {
416
                fprintf(out, "   {");
417
                write_table64(out, big[k], 256);
418
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
419
            }
420
            fprintf(out,
421
            "};\n");
422
423
            /* compute braid tables for this N and 32-bit word_t */
424
            braid(ltl, big, n, 4);
425
426
            /* write out braid tables for 32-bit z_word_t to crc32.h */
427
            fprintf(out,
428
            "\n"
429
            "#else /* W == 4 */\n"
430
            "\n"
431
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
432
            for (k = 0; k < 4; k++) {
433
                fprintf(out, "   {");
434
                write_table(out, ltl[k], 256);
435
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
436
            }
437
            fprintf(out,
438
            "};\n"
439
            "\n"
440
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
441
            for (k = 0; k < 4; k++) {
442
                fprintf(out, "   {");
443
                write_table32hi(out, big[k], 256);
444
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
445
            }
446
            fprintf(out,
447
            "};\n"
448
            "\n"
449
            "#endif\n"
450
            "\n"
451
            "#endif\n");
452
        }
453
        fprintf(out,
454
            "\n"
455
            "#endif\n");
456
457
        /* write out zeros operator table to crc32.h */
458
        fprintf(out,
459
            "\n"
460
            "local const z_crc_t FAR x2n_table[] = {\n"
461
            "    ");
462
        write_table(out, x2n_table, 32);
463
        fprintf(out,
464
            "};\n");
465
        fclose(out);
466
    }
467
#endif /* MAKECRCH */
468
}
469
470
#ifdef MAKECRCH
471
472
/*
473
   Write the 32-bit values in table[0..k-1] to out, five per line in
474
   hexadecimal separated by commas.
475
 */
476
local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
477
    int n;
478
479
    for (n = 0; n < k; n++)
480
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
481
                (unsigned long)(table[n]),
482
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
483
}
484
485
/*
486
   Write the high 32-bits of each value in table[0..k-1] to out, five per line
487
   in hexadecimal separated by commas.
488
 */
489
local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
490
    int n;
491
492
    for (n = 0; n < k; n++)
493
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
494
                (unsigned long)(table[n] >> 32),
495
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
496
}
497
498
/*
499
  Write the 64-bit values in table[0..k-1] to out, three per line in
500
  hexadecimal separated by commas. This assumes that if there is a 64-bit
501
  type, then there is also a long long integer type, and it is at least 64
502
  bits. If not, then the type cast and format string can be adjusted
503
  accordingly.
504
 */
505
local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
506
    int n;
507
508
    for (n = 0; n < k; n++)
509
        fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
510
                (unsigned long long)(table[n]),
511
                n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
512
}
513
514
/* Actually do the deed. */
515
int main(void) {
516
    make_crc_table();
517
    return 0;
518
}
519
520
#endif /* MAKECRCH */
521
522
#ifdef W
523
/*
524
  Generate the little and big-endian braid tables for the given n and z_word_t
525
  size w. Each array must have room for w blocks of 256 elements.
526
 */
527
local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
528
    int k;
529
    z_crc_t i, p, q;
530
    for (k = 0; k < w; k++) {
531
        p = x2nmodp((n * w + 3 - k) << 3, 0);
532
        ltl[k][0] = 0;
533
        big[w - 1 - k][0] = 0;
534
        for (i = 1; i < 256; i++) {
535
            ltl[k][i] = q = multmodp(i << 24, p);
536
            big[w - 1 - k][i] = byte_swap(q);
537
        }
538
    }
539
}
540
#endif
541
542
#endif /* DYNAMIC_CRC_TABLE */
543
544
/* =========================================================================
545
 * This function can be used by asm versions of crc32(), and to force the
546
 * generation of the CRC tables in a threaded application.
547
 */
548 0
const z_crc_t FAR * ZEXPORT get_crc_table(void) {
549
#ifdef DYNAMIC_CRC_TABLE
550
    once(&made, make_crc_table);
551
#endif /* DYNAMIC_CRC_TABLE */
552 0
    return (const z_crc_t FAR *)crc_table;
553
}
554
555
/* =========================================================================
556
 * Use ARM machine instructions if available. This will compute the CRC about
557
 * ten times faster than the braided calculation. This code does not check for
558
 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
559
 * only be defined if the compilation specifies an ARM processor architecture
560
 * that has the instructions. For example, compiling with -march=armv8.1-a or
561
 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
562
 * instructions.
563
 */
564
#ifdef ARMCRC32
565
566
/*
567
   Constants empirically determined to maximize speed. These values are from
568
   measurements on a Cortex-A57. Your mileage may vary.
569
 */
570
#define Z_BATCH 3990                /* number of words in a batch */
571
#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
572
#define Z_BATCH_MIN 800             /* fewest words in a final batch */
573
574
unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
575
                              z_size_t len) {
576
    z_crc_t val;
577
    z_word_t crc1, crc2;
578
    const z_word_t *word;
579
    z_word_t val0, val1, val2;
580
    z_size_t last, last2, i;
581
    z_size_t num;
582
583
    /* Return initial CRC, if requested. */
584
    if (buf == Z_NULL) return 0;
585
586
#ifdef DYNAMIC_CRC_TABLE
587
    once(&made, make_crc_table);
588
#endif /* DYNAMIC_CRC_TABLE */
589
590
    /* Pre-condition the CRC */
591
    crc = (~crc) & 0xffffffff;
592
593
    /* Compute the CRC up to a word boundary. */
594
    while (len && ((z_size_t)buf & 7) != 0) {
595
        len--;
596
        val = *buf++;
597
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
598
    }
599
600
    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
601
    word = (z_word_t const *)(intptr_t)buf;
602
    num = len >> 3;
603
    len &= 7;
604
605
    /* Do three interleaved CRCs to realize the throughput of one crc32x
606
       instruction per cycle. Each CRC is calculated on Z_BATCH words. The
607
       three CRCs are combined into a single CRC after each set of batches. */
608
    while (num >= 3 * Z_BATCH) {
609
        crc1 = 0;
610
        crc2 = 0;
611
        for (i = 0; i < Z_BATCH; i++) {
612
            val0 = word[i];
613
            val1 = word[i + Z_BATCH];
614
            val2 = word[i + 2 * Z_BATCH];
615
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
616
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
617
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
618
        }
619
        word += 3 * Z_BATCH;
620
        num -= 3 * Z_BATCH;
621
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
622
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
623
    }
624
625
    /* Do one last smaller batch with the remaining words, if there are enough
626
       to pay for the combination of CRCs. */
627
    last = num / 3;
628
    if (last >= Z_BATCH_MIN) {
629
        last2 = last << 1;
630
        crc1 = 0;
631
        crc2 = 0;
632
        for (i = 0; i < last; i++) {
633
            val0 = word[i];
634
            val1 = word[i + last];
635
            val2 = word[i + last2];
636
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
637
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
638
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
639
        }
640
        word += 3 * last;
641
        num -= 3 * last;
642
        val = x2nmodp(last, 6);
643
        crc = multmodp(val, crc) ^ crc1;
644
        crc = multmodp(val, crc) ^ crc2;
645
    }
646
647
    /* Compute the CRC on any remaining words. */
648
    for (i = 0; i < num; i++) {
649
        val0 = word[i];
650
        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
651
    }
652
    word += num;
653
654
    /* Complete the CRC on any remaining bytes. */
655
    buf = (const unsigned char FAR *)word;
656
    while (len) {
657
        len--;
658
        val = *buf++;
659
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
660
    }
661
662
    /* Return the CRC, post-conditioned. */
663
    return crc ^ 0xffffffff;
664
}
665
666
#else
667
668
#ifdef W
669
670
/*
671
  Return the CRC of the W bytes in the word_t data, taking the
672
  least-significant byte of the word as the first byte of data, without any pre
673
  or post conditioning. This is used to combine the CRCs of each braid.
674
 */
675 2211630
local z_crc_t crc_word (z_word_t data) {
676
    int k;
677 19904670
    for (k = 0; k < W; k++)
678 17693040
        data = (data >> 8) ^ crc_table[data & 0xff];
679 2211630
    return (z_crc_t)data;
680
}
681
682 0
local z_word_t crc_word_big (z_word_t data) {
683
    int k;
684 0
    for (k = 0; k < W; k++)
685 0
        data = (data << 8) ^
686 0
            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
687 0
    return data;
688
}
689
690
#endif
691
692
/* ========================================================================= */
693 2821488
unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
694
                              z_size_t len) {
695
    /* Return initial CRC, if requested. */
696 2821488
    if (buf == Z_NULL) return 0;
697
698
#ifdef DYNAMIC_CRC_TABLE
699
    once(&made, make_crc_table);
700
#endif /* DYNAMIC_CRC_TABLE */
701
702
    /* Pre-condition the CRC */
703 2724196
    crc = (~crc) & 0xffffffff;
704
705
#ifdef W
706
707
    /* If provided enough bytes, do a braided CRC calculation. */
708 2724196
    if (len >= N * W + W - 1) {
709
        z_size_t blks;
710
        z_word_t const *words;
711
        unsigned endian;
712
        int k;
713
714
        /* Compute the CRC up to a z_word_t boundary. */
715 1994164
        while (len && ((z_size_t)buf & (W - 1)) != 0) {
716 1551838
            len--;
717 1551838
            crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
718
        }
719
720
        /* Compute the CRC on as many N z_word_t blocks as are available. */
721 442326
        blks = len / (N * W);
722 442326
        len -= blks * N * W;
723 442326
        words = (z_word_t const *)(intptr_t)buf;
724
725
        /* Do endian check at execution time instead of compile time, since ARM
726
           processors can change the endianness at execution time. If the
727
           compiler knows what the endianness will be, it can optimize out the
728
           check and the unused branch. */
729 442326
        endian = 1;
730 442326
        if (*(unsigned char *)&endian) {
731
            /* Little endian. */
732
733
            z_crc_t crc0;
734
            z_word_t word0;
735
#if N > 1
736
            z_crc_t crc1;
737
            z_word_t word1;
738
#if N > 2
739
            z_crc_t crc2;
740
            z_word_t word2;
741
#if N > 3
742
            z_crc_t crc3;
743
            z_word_t word3;
744
#if N > 4
745
            z_crc_t crc4;
746
            z_word_t word4;
747
#if N > 5
748
            z_crc_t crc5;
749
            z_word_t word5;
750
#endif
751
#endif
752
#endif
753
#endif
754
#endif
755
756
            /* Initialize the CRC for each braid. */
757 442326
            crc0 = crc;
758
#if N > 1
759 442326
            crc1 = 0;
760
#if N > 2
761 442326
            crc2 = 0;
762
#if N > 3
763 442326
            crc3 = 0;
764
#if N > 4
765 442326
            crc4 = 0;
766
#if N > 5
767
            crc5 = 0;
768
#endif
769
#endif
770
#endif
771
#endif
772
#endif
773
774
            /*
775
              Process the first blks-1 blocks, computing the CRCs on each braid
776
              independently.
777
             */
778 3978888
            while (--blks) {
779
                /* Load the word for each braid into registers. */
780 3536562
                word0 = crc0 ^ words[0];
781
#if N > 1
782 3536562
                word1 = crc1 ^ words[1];
783
#if N > 2
784 3536562
                word2 = crc2 ^ words[2];
785
#if N > 3
786 3536562
                word3 = crc3 ^ words[3];
787
#if N > 4
788 3536562
                word4 = crc4 ^ words[4];
789
#if N > 5
790
                word5 = crc5 ^ words[5];
791
#endif
792
#endif
793
#endif
794
#endif
795
#endif
796 3536562
                words += N;
797
798
                /* Compute and update the CRC for each word. The loop should
799
                   get unrolled. */
800 3536562
                crc0 = crc_braid_table[0][word0 & 0xff];
801
#if N > 1
802 3536562
                crc1 = crc_braid_table[0][word1 & 0xff];
803
#if N > 2
804 3536562
                crc2 = crc_braid_table[0][word2 & 0xff];
805
#if N > 3
806 3536562
                crc3 = crc_braid_table[0][word3 & 0xff];
807
#if N > 4
808 3536562
                crc4 = crc_braid_table[0][word4 & 0xff];
809
#if N > 5
810
                crc5 = crc_braid_table[0][word5 & 0xff];
811
#endif
812
#endif
813
#endif
814
#endif
815
#endif
816 28292496
                for (k = 1; k < W; k++) {
817 24755934
                    crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
818
#if N > 1
819 24755934
                    crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
820
#if N > 2
821 24755934
                    crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
822
#if N > 3
823 24755934
                    crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
824
#if N > 4
825 24755934
                    crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
826
#if N > 5
827
                    crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
828
#endif
829
#endif
830
#endif
831
#endif
832
#endif
833 24755934
                }
834
            }
835
836
            /*
837
              Process the last block, combining the CRCs of the N braids at the
838
              same time.
839
             */
840 442326
            crc = crc_word(crc0 ^ words[0]);
841
#if N > 1
842 442326
            crc = crc_word(crc1 ^ words[1] ^ crc);
843
#if N > 2
844 442326
            crc = crc_word(crc2 ^ words[2] ^ crc);
845
#if N > 3
846 442326
            crc = crc_word(crc3 ^ words[3] ^ crc);
847
#if N > 4
848 442326
            crc = crc_word(crc4 ^ words[4] ^ crc);
849
#if N > 5
850
            crc = crc_word(crc5 ^ words[5] ^ crc);
851
#endif
852
#endif
853
#endif
854
#endif
855
#endif
856 442326
            words += N;
857 442326
        }
858
        else {
859
            /* Big endian. */
860
861
            z_word_t crc0, word0, comb;
862
#if N > 1
863
            z_word_t crc1, word1;
864
#if N > 2
865
            z_word_t crc2, word2;
866
#if N > 3
867
            z_word_t crc3, word3;
868
#if N > 4
869
            z_word_t crc4, word4;
870
#if N > 5
871
            z_word_t crc5, word5;
872
#endif
873
#endif
874
#endif
875
#endif
876
#endif
877
878
            /* Initialize the CRC for each braid. */
879 0
            crc0 = byte_swap(crc);
880
#if N > 1
881 0
            crc1 = 0;
882
#if N > 2
883 0
            crc2 = 0;
884
#if N > 3
885 0
            crc3 = 0;
886
#if N > 4
887 0
            crc4 = 0;
888
#if N > 5
889
            crc5 = 0;
890
#endif
891
#endif
892
#endif
893
#endif
894
#endif
895
896
            /*
897
              Process the first blks-1 blocks, computing the CRCs on each braid
898
              independently.
899
             */
900 0
            while (--blks) {
901
                /* Load the word for each braid into registers. */
902 0
                word0 = crc0 ^ words[0];
903
#if N > 1
904 0
                word1 = crc1 ^ words[1];
905
#if N > 2
906 0
                word2 = crc2 ^ words[2];
907
#if N > 3
908 0
                word3 = crc3 ^ words[3];
909
#if N > 4
910 0
                word4 = crc4 ^ words[4];
911
#if N > 5
912
                word5 = crc5 ^ words[5];
913
#endif
914
#endif
915
#endif
916
#endif
917
#endif
918 0
                words += N;
919
920
                /* Compute and update the CRC for each word. The loop should
921
                   get unrolled. */
922 0
                crc0 = crc_braid_big_table[0][word0 & 0xff];
923
#if N > 1
924 0
                crc1 = crc_braid_big_table[0][word1 & 0xff];
925
#if N > 2
926 0
                crc2 = crc_braid_big_table[0][word2 & 0xff];
927
#if N > 3
928 0
                crc3 = crc_braid_big_table[0][word3 & 0xff];
929
#if N > 4
930 0
                crc4 = crc_braid_big_table[0][word4 & 0xff];
931
#if N > 5
932
                crc5 = crc_braid_big_table[0][word5 & 0xff];
933
#endif
934
#endif
935
#endif
936
#endif
937
#endif
938 0
                for (k = 1; k < W; k++) {
939 0
                    crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
940
#if N > 1
941 0
                    crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
942
#if N > 2
943 0
                    crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
944
#if N > 3
945 0
                    crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
946
#if N > 4
947 0
                    crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
948
#if N > 5
949
                    crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
950
#endif
951
#endif
952
#endif
953
#endif
954
#endif
955 0
                }
956
            }
957
958
            /*
959
              Process the last block, combining the CRCs of the N braids at the
960
              same time.
961
             */
962 0
            comb = crc_word_big(crc0 ^ words[0]);
963
#if N > 1
964 0
            comb = crc_word_big(crc1 ^ words[1] ^ comb);
965
#if N > 2
966 0
            comb = crc_word_big(crc2 ^ words[2] ^ comb);
967
#if N > 3
968 0
            comb = crc_word_big(crc3 ^ words[3] ^ comb);
969
#if N > 4
970 0
            comb = crc_word_big(crc4 ^ words[4] ^ comb);
971
#if N > 5
972
            comb = crc_word_big(crc5 ^ words[5] ^ comb);
973
#endif
974
#endif
975
#endif
976
#endif
977
#endif
978 0
            words += N;
979 0
            crc = byte_swap(comb);
980
        }
981
982
        /*
983
          Update the pointer to the remaining bytes to process.
984
         */
985 442326
        buf = (unsigned char const *)words;
986 442326
    }
987
988
#endif /* W */
989
990
    /* Complete the computation of the CRC on any remaining bytes. */
991 3650200
    while (len >= 8) {
992 926004
        len -= 8;
993 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
994 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
995 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
996 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
997 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
998 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
999 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1000 926004
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1001
    }
1002 9981966
    while (len) {
1003 7257770
        len--;
1004 7257770
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1005
    }
1006
1007
    /* Return the CRC, post-conditioned. */
1008 2724196
    return crc ^ 0xffffffff;
1009 2821488
}
1010
1011
#endif
1012
1013
/* ========================================================================= */
1014 2821487
unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1015
                            uInt len) {
1016 2821487
    return crc32_z(crc, buf, len);
1017
}
1018
1019
/* ========================================================================= */
1020 15961
uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1021
#ifdef DYNAMIC_CRC_TABLE
1022
    once(&made, make_crc_table);
1023
#endif /* DYNAMIC_CRC_TABLE */
1024 15961
    return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1025
}
1026
1027
/* ========================================================================= */
1028 15961
uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1029 15961
    return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1030
}
1031
1032
/* ========================================================================= */
1033 0
uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1034
#ifdef DYNAMIC_CRC_TABLE
1035
    once(&made, make_crc_table);
1036
#endif /* DYNAMIC_CRC_TABLE */
1037 0
    return x2nmodp(len2, 3);
1038
}
1039
1040
/* ========================================================================= */
1041 0
uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1042 0
    return crc32_combine_gen64((z_off64_t)len2);
1043
}
1044
1045
/* ========================================================================= */
1046 0
uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1047 0
    return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1048
}