Replace SSE-optimized CRC32C in POSIX port with external library.
Maintaining a hardware-accelerated CRC32C implementation tailored for all modern platforms deserves a repository of its own. We extracted the implementation here into https://github.com/google/crc32c and improved it in that repository. This CL removes the SSE-optimized implementation from this codebase, and adds the ability to use the google/crc32c library, if it is present on the system. The benchmarks below show the performance impact of the change. In summary, open source builds that use the google/crc32c library can expect a 3x improvement in CRC32C throughput, whereas builds that do not use the library will see a 50% drop in CRC32C throughput. This translates in much smaller changes in overall leveldb performance. Baseline, MacBookPro13,3 with Core i7 6920HQ: LevelDB: version 1.20 Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) ------------------------------------------------ fillseq : 3.064 micros/op; 36.1 MB/s fillsync : 57.861 micros/op; 1.9 MB/s (1000 ops) fillrandom : 3.887 micros/op; 28.5 MB/s overwrite : 4.140 micros/op; 26.7 MB/s readrandom : 7.433 micros/op; (1000000 of 1000000 found) readrandom : 6.825 micros/op; (1000000 of 1000000 found) readseq : 0.244 micros/op; 453.4 MB/s readreverse : 0.387 micros/op; 285.8 MB/s compact : 449707.000 micros/op; readrandom : 4.196 micros/op; (1000000 of 1000000 found) readseq : 0.228 micros/op; 485.8 MB/s readreverse : 0.320 micros/op; 345.2 MB/s fill100K : 562.556 micros/op; 169.6 MB/s (1000 ops) crc32c : 0.768 micros/op; 5085.0 MB/s (4K per op) snappycomp : 4.220 micros/op; 925.7 MB/s (output: 55.1%) snappyuncomp : 0.635 micros/op; 6155.7 MB/s acquireload : 13.054 micros/op; (each op is 1000 loads) New with crc32c, MacBookPro13,3 with Core i7 6920HQ: LevelDB: version 1.20 Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) ------------------------------------------------ fillseq : 2.820 micros/op; 39.2 MB/s fillsync : 51.988 micros/op; 2.1 MB/s (1000 ops) fillrandom : 3.747 micros/op; 29.5 MB/s overwrite : 4.047 micros/op; 27.3 MB/s readrandom : 7.287 micros/op; (1000000 of 1000000 found) readrandom : 6.927 micros/op; (1000000 of 1000000 found) readseq : 0.253 micros/op; 437.5 MB/s readreverse : 0.411 micros/op; 269.2 MB/s compact : 440405.000 micros/op; readrandom : 4.159 micros/op; (1000000 of 1000000 found) readseq : 0.230 micros/op; 481.1 MB/s readreverse : 0.320 micros/op; 345.9 MB/s fill100K : 558.222 micros/op; 170.9 MB/s (1000 ops) crc32c : 0.214 micros/op; 18263.5 MB/s (4K per op) snappycomp : 4.471 micros/op; 873.7 MB/s (output: 55.1%) snappyuncomp : 0.833 micros/op; 4688.5 MB/s acquireload : 13.289 micros/op; (each op is 1000 loads) New without crc32c, MacBookPro13,3 with Core i7 6920HQ LevelDB: version 1.20 Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) ------------------------------------------------ fillseq : 3.094 micros/op; 35.8 MB/s fillsync : 52.160 micros/op; 2.1 MB/s (1000 ops) fillrandom : 4.090 micros/op; 27.0 MB/s overwrite : 4.006 micros/op; 27.6 MB/s readrandom : 6.584 micros/op; (1000000 of 1000000 found) readrandom : 6.676 micros/op; (1000000 of 1000000 found) readseq : 0.280 micros/op; 395.2 MB/s readreverse : 0.391 micros/op; 283.2 MB/s compact : 433911.000 micros/op; readrandom : 4.261 micros/op; (1000000 of 1000000 found) readseq : 0.251 micros/op; 440.5 MB/s readreverse : 0.356 micros/op; 310.9 MB/s fill100K : 584.023 micros/op; 163.3 MB/s (1000 ops) crc32c : 1.384 micros/op; 2822.3 MB/s (4K per op) snappycomp : 4.763 micros/op; 820.1 MB/s (output: 55.1%) snappyuncomp : 0.766 micros/op; 5098.6 MB/s acquireload : 12.931 micros/op; (each op is 1000 loads) ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=171667771
This commit is contained in:
parent
ca216e493f
commit
5c39524f36
8
Makefile
8
Makefile
@ -98,7 +98,7 @@ SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o))
|
||||
|
||||
TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o
|
||||
TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL)
|
||||
TEST_STATIC_OBJS := $(STATIC_OUTDIR)/port/port_posix.o $(STATIC_OUTDIR)/port/port_posix_sse.o $(STATIC_OUTDIR)/util/crc32c.o $(STATIC_OUTDIR)/util/histogram.o
|
||||
TEST_STATIC_OBJS := $(STATIC_OUTDIR)/port/port_posix.o $(STATIC_OUTDIR)/util/crc32c.o $(STATIC_OUTDIR)/util/histogram.o
|
||||
|
||||
STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS)))
|
||||
STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS)))
|
||||
@ -426,9 +426,3 @@ $(SHARED_OUTDIR)/%.o: %.cc
|
||||
|
||||
$(SHARED_OUTDIR)/%.o: %.c
|
||||
$(CC) $(CFLAGS) $(SHARED_BUILD_CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||
|
||||
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(SHARED_BUILD_CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
|
||||
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(SHARED_BUILD_CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
|
@ -22,6 +22,7 @@
|
||||
#
|
||||
# -DLEVELDB_ATOMIC_PRESENT if <atomic> is present
|
||||
# -DLEVELDB_PLATFORM_POSIX=1 for Posix-based platforms
|
||||
# -DHAVE_CRC32C=1 if the CRC32C library is present
|
||||
# -DHAVE_SNAPPY=1 if the Snappy library is present
|
||||
#
|
||||
|
||||
@ -63,7 +64,6 @@ PLATFORM_SHARED_EXT="so"
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
||||
PLATFORM_SHARED_CFLAGS="-fPIC -fvisibility=hidden"
|
||||
PLATFORM_SHARED_VERSIONED=true
|
||||
PLATFORM_SSEFLAGS=
|
||||
|
||||
MEMCMP_FLAG=
|
||||
if [ "$CXX" = "g++" ]; then
|
||||
@ -78,7 +78,6 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
|
||||
PLATFORM_LDFLAGS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Darwin)
|
||||
PLATFORM=OS_MACOSX
|
||||
@ -87,56 +86,48 @@ case "$TARGET_OS" in
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Linux)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
SunOS)
|
||||
PLATFORM=OS_SOLARIS
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
|
||||
PLATFORM_LIBS="-lpthread -lrt"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
FreeBSD)
|
||||
PLATFORM=OS_FREEBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
NetBSD)
|
||||
PLATFORM=OS_NETBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
|
||||
PLATFORM_LIBS="-lpthread -lgcc_s"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OpenBSD)
|
||||
PLATFORM=OS_OPENBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
DragonFly)
|
||||
PLATFORM=OS_DRAGONFLYBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OS_ANDROID_CROSSCOMPILE)
|
||||
PLATFORM=OS_ANDROID
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX=1"
|
||||
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
HP-UX)
|
||||
@ -144,7 +135,6 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
# man ld: +h internal_name
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
|
||||
;;
|
||||
@ -153,7 +143,6 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG"
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
PLATFORM_SHARED_EXT=
|
||||
PLATFORM_SHARED_LDFLAGS=
|
||||
PLATFORM_SHARED_CFLAGS=
|
||||
@ -180,7 +169,7 @@ set +f # re-enable globbing
|
||||
|
||||
# The sources consist of the portable files, plus the platform-specific port
|
||||
# file.
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
|
||||
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
|
||||
|
||||
if [ "$CROSS_COMPILE" = "true" ]; then
|
||||
@ -202,8 +191,20 @@ EOF
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX=1"
|
||||
fi
|
||||
|
||||
# Test whether CRC32C library is installed
|
||||
# https://github.com/google/crc32c
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
|
||||
#include <crc32c/crc32c.h>
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_CRC32C=1"
|
||||
PLATFORM_LIBS="$PLATFORM_LIBS -lcrc32c"
|
||||
fi
|
||||
|
||||
|
||||
# Test whether Snappy library is installed
|
||||
# http://code.google.com/p/snappy/
|
||||
# https://github.com/google/snappy
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
|
||||
#include <snappy.h>
|
||||
int main() {}
|
||||
@ -223,22 +224,9 @@ EOF
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
|
||||
# Test if gcc SSE 4.2 is supported
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
PLATFORM_SSEFLAGS="-msse4.2"
|
||||
fi
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
fi
|
||||
|
||||
# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
|
||||
if [ -n "$PLATFORM_SSEFLAGS" ]; then
|
||||
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
|
||||
fi
|
||||
|
||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
|
||||
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
|
||||
|
||||
@ -249,7 +237,6 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
|
||||
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
|
||||
|
@ -39,6 +39,9 @@
|
||||
#endif
|
||||
|
||||
#include <pthread.h>
|
||||
#if defined(HAVE_CRC32C)
|
||||
#include <crc32c/crc32c.h>
|
||||
#endif // defined(HAVE_CRC32C)
|
||||
#ifdef HAVE_SNAPPY
|
||||
#include <snappy.h>
|
||||
#endif // defined(HAVE_SNAPPY)
|
||||
@ -139,7 +142,13 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
|
||||
inline uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
|
||||
#if defined(HAVE_CRC32C)
|
||||
return ::crc32c::Extend(crc, reinterpret_cast<const uint8_t*>(buf), size);
|
||||
#else
|
||||
return 0;
|
||||
#endif // defined(HAVE_CRC32C)
|
||||
}
|
||||
|
||||
} // namespace port
|
||||
} // namespace leveldb
|
||||
|
@ -1,133 +0,0 @@
|
||||
// Copyright 2016 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A portable implementation of crc32c, optimized to handle
|
||||
// four bytes at a time.
|
||||
//
|
||||
// In a separate source file to allow this accelerated CRC32C function to be
|
||||
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
|
||||
// instructions.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "port/port.h"
|
||||
|
||||
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#elif defined(__GNUC__) && defined(__SSE4_2__)
|
||||
#include <nmmintrin.h>
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
namespace leveldb {
|
||||
namespace port {
|
||||
|
||||
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
|
||||
static inline uint32_t LE_LOAD32(const uint8_t *p) {
|
||||
// SSE is x86 only, so ensured that |p| is always little-endian.
|
||||
uint32_t word;
|
||||
memcpy(&word, p, sizeof(word));
|
||||
return word;
|
||||
}
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__) // LE_LOAD64 is only used on x64.
|
||||
|
||||
// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
|
||||
static inline uint64_t LE_LOAD64(const uint8_t *p) {
|
||||
uint64_t dword;
|
||||
memcpy(&dword, p, sizeof(dword));
|
||||
return dword;
|
||||
}
|
||||
|
||||
#endif // defined(_M_X64) || defined(__x86_64__)
|
||||
|
||||
static inline bool HaveSSE42() {
|
||||
#if defined(_MSC_VER)
|
||||
int cpu_info[4];
|
||||
__cpuid(cpu_info, 1);
|
||||
return (cpu_info[2] & (1 << 20)) != 0;
|
||||
#elif defined(__GNUC__)
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
__get_cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
return (ecx & (1 << 20)) != 0;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
// For further improvements see Intel publication at:
|
||||
// http://download.intel.com/design/intarch/papers/323405.pdf
|
||||
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
|
||||
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
return 0;
|
||||
#else
|
||||
static bool have = HaveSSE42();
|
||||
if (!have) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
||||
const uint8_t *e = p + size;
|
||||
uint32_t l = crc ^ 0xffffffffu;
|
||||
|
||||
#define STEP1 do { \
|
||||
l = _mm_crc32_u8(l, *p++); \
|
||||
} while (0)
|
||||
#define STEP4 do { \
|
||||
l = _mm_crc32_u32(l, LE_LOAD32(p)); \
|
||||
p += 4; \
|
||||
} while (0)
|
||||
#define STEP8 do { \
|
||||
l = _mm_crc32_u64(l, LE_LOAD64(p)); \
|
||||
p += 8; \
|
||||
} while (0)
|
||||
|
||||
if (size > 16) {
|
||||
// Point x at first 8-byte aligned byte in string. This must be inside the
|
||||
// string, due to the size check above.
|
||||
const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
|
||||
const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 7) >> 3) << 3);
|
||||
// Process bytes until p is 8-byte aligned.
|
||||
while (p != x) {
|
||||
STEP1;
|
||||
}
|
||||
|
||||
// _mm_crc32_u64 is only available on x64.
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
// Process 8 bytes at a time
|
||||
while ((e-p) >= 8) {
|
||||
STEP8;
|
||||
}
|
||||
// Process 4 bytes at a time
|
||||
if ((e-p) >= 4) {
|
||||
STEP4;
|
||||
}
|
||||
#else // !(defined(_M_X64) || defined(__x86_64__))
|
||||
// Process 4 bytes at a time
|
||||
while ((e-p) >= 4) {
|
||||
STEP4;
|
||||
}
|
||||
#endif // defined(_M_X64) || defined(__x86_64__)
|
||||
}
|
||||
// Process the last few bytes
|
||||
while (p != e) {
|
||||
STEP1;
|
||||
}
|
||||
#undef STEP8
|
||||
#undef STEP4
|
||||
#undef STEP1
|
||||
return l ^ 0xffffffffu;
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
}
|
||||
|
||||
} // namespace port
|
||||
} // namespace leveldb
|
Loading…
Reference in New Issue
Block a user