Skip to content
Snippets Groups Projects
Commit 088ccca0 authored by Brian Bockelman, committed by Philippe Canal
Browse files

Add checksum to the LZ4 compressed buffer format.

Simply reuses the XXHASH implementation from the LZ4 library (also
used in the LZ4 frame format).
parent 8a23d174
Branches
Tags
No related merge requests found
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#---Declare ZipLZ4 sources as part of libCore------------------------------- #---Declare ZipLZ4 sources as part of libCore-------------------------------
set(headers ${CMAKE_CURRENT_SOURCE_DIR}/inc/ZipLZ4.h) set(headers ${CMAKE_CURRENT_SOURCE_DIR}/inc/ZipLZ4.h)
set(sources ${CMAKE_CURRENT_SOURCE_DIR}/src/ZipLZ4.c) set(sources ${CMAKE_CURRENT_SOURCE_DIR}/src/ZipLZ4.cxx)
include_directories(${LZ4_INCLUDE_DIR}) include_directories(${LZ4_INCLUDE_DIR})
......
...@@ -8,6 +8,14 @@ ...@@ -8,6 +8,14 @@
* For the list of contributors see $ROOTSYS/README/CREDITS. * * For the list of contributors see $ROOTSYS/README/CREDITS. *
*************************************************************************/ *************************************************************************/
void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, int *irep); // NOTE: the ROOT compression libraries aren't consistently written in C++; hence the
// #ifdef's to avoid problems with C code.
#ifdef __cplusplus
extern "C" {
#endif
void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, int *irep);
void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char *tgt, int *irep);
#ifdef __cplusplus
}
#endif
void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char *tgt, int *irep);
...@@ -12,11 +12,29 @@ ...@@ -12,11 +12,29 @@
#include "lz4.h" #include "lz4.h"
#include "lz4hc.h" #include "lz4hc.h"
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <cinttypes>
#include <cstdint>
#include <cstring>
#include "RConfig.h" #include "RConfig.h"
static const int kHeaderSize = 9; // Pulled from liblz4; upstream library explicitly exposes the symbol but the default build
// excludes the header.
typedef unsigned long long XXH64_hash_t;
typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
extern "C" XXH64_hash_t LZ4_XXH64(const void* input, size_t length, unsigned long long seed);
extern "C" void LZ4_XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
extern "C" XXH64_hash_t LZ4_XXH64_hashFromCanonical(const XXH64_canonical_t* src);
// Header consists of:
// - 2 byte identifier "L4"
// - 1 byte LZ4 major version number.
// - 3 bytes of compressed size (little-endian; includes the checksum)
// - 3 bytes of uncompressed size (little-endian)
// - 8 byte checksum using xxhash 64.
static const int kChecksumOffset = 2 + 1 + 3 + 3;
static const int kChecksumSize = sizeof(XXH64_canonical_t);
static const int kHeaderSize = kChecksumOffset + kChecksumSize;
void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, int *irep) void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, int *irep)
{ {
...@@ -26,11 +44,12 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in ...@@ -26,11 +44,12 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in
*irep = 0; *irep = 0;
if (*tgtsize <= 0) { if (R__unlikely(*tgtsize <= 0)) {
return; return;
} }
if (*srcsize > 0xffffff || *srcsize < 0) { // Refuse to compress more than 16MB at a time -- we are only allowed 3 bytes for size info.
if (R__unlikely(*srcsize > 0xffffff || *srcsize < 0)) {
return; return;
} }
...@@ -47,13 +66,16 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in ...@@ -47,13 +66,16 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in
if (R__unlikely(returnStatus == 0)) { /* LZ4 compression failed */ if (R__unlikely(returnStatus == 0)) { /* LZ4 compression failed */
return; return;
} }
XXH64_hash_t checksumResult = LZ4_XXH64(tgt + kHeaderSize, returnStatus, 0);
tgt[0] = 'L'; tgt[0] = 'L';
tgt[1] = '4'; tgt[1] = '4';
tgt[2] = (LZ4_version / (100 * 100)); tgt[2] = (LZ4_version / (100 * 100));
out_size = returnStatus; /* compressed size */ out_size = returnStatus + kChecksumSize; /* compressed size, including the checksum. */
// NOTE: these next 6 bytes are required from the ROOT compressed buffer format;
// upper layers will assume they are laid out in a specific manner.
tgt[3] = (char)(out_size & 0xff); tgt[3] = (char)(out_size & 0xff);
tgt[4] = (char)((out_size >> 8) & 0xff); tgt[4] = (char)((out_size >> 8) & 0xff);
tgt[5] = (char)((out_size >> 16) & 0xff); tgt[5] = (char)((out_size >> 16) & 0xff);
...@@ -62,11 +84,17 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in ...@@ -62,11 +84,17 @@ void R__zipLZ4(int cxlevel, int *srcsize, char *src, int *tgtsize, char *tgt, in
tgt[7] = (char)((in_size >> 8) & 0xff); tgt[7] = (char)((in_size >> 8) & 0xff);
tgt[8] = (char)((in_size >> 16) & 0xff); tgt[8] = (char)((in_size >> 16) & 0xff);
// Write out checksum.
LZ4_XXH64_canonicalFromHash(reinterpret_cast<XXH64_canonical_t*>(tgt + kChecksumOffset), checksumResult);
*irep = (int)returnStatus + kHeaderSize; *irep = (int)returnStatus + kHeaderSize;
} }
void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char *tgt, int *irep) void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char *tgt, int *irep)
{ {
// NOTE: We don't check that srcsize / tgtsize is reasonable or within the ROOT-imposed limits.
// This is assumed to be handled by the upper layers.
int LZ4_version = LZ4_versionNumber() / (100 * 100); int LZ4_version = LZ4_versionNumber() / (100 * 100);
*irep = 0; *irep = 0;
if (R__unlikely(src[0] != 'L' || src[1] != '4')) { if (R__unlikely(src[0] != 'L' || src[1] != '4')) {
...@@ -81,7 +109,22 @@ void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char * ...@@ -81,7 +109,22 @@ void R__unzipLZ4(int *srcsize, unsigned char *src, int *tgtsize, unsigned char *
return; return;
} }
int returnStatus = LZ4_decompress_safe((char *)(&src[kHeaderSize]), (char *)(tgt), *srcsize - kHeaderSize, *tgtsize); int inputBufferSize = *srcsize - kHeaderSize;
// TODO: The checksum followed by the decompression means we iterate through the buffer twice.
// We should perform some performance tests to see whether we can interleave the two -- i.e., at
// what size of chunks does interleaving (avoiding two fetches from RAM) improve enough for the
// extra function call costs? NOTE that ROOT limits the buffer size to 16MB.
XXH64_hash_t checksumResult = LZ4_XXH64(src + kHeaderSize, inputBufferSize, 0);
XXH64_hash_t checksumFromFile = LZ4_XXH64_hashFromCanonical(reinterpret_cast<XXH64_canonical_t*>(src + kChecksumOffset));
if (R__unlikely(checksumFromFile != checksumResult)) {
fprintf(stderr,
"R__unzipLZ4: Buffer corruption error! Calculated checksum %llu; checksum calculated in the file was %llu.\n",
checksumResult, checksumFromFile);
return;
}
int returnStatus = LZ4_decompress_safe((char *)(&src[kHeaderSize]), (char *)(tgt), inputBufferSize, *tgtsize);
if (R__unlikely(returnStatus < 0)) { if (R__unlikely(returnStatus < 0)) {
fprintf(stderr, "R__unzipLZ4: error in decompression around byte %d out of maximum %d.\n", -returnStatus, fprintf(stderr, "R__unzipLZ4: error in decompression around byte %d out of maximum %d.\n", -returnStatus,
*tgtsize); *tgtsize);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment