Implement support for Intel crc32 instruction (SSE 4.2)
This change authored by vadimskipin and submitted via: https://github.com/google/leveldb/pull/309 Changes made to support iOS builds and other architectures without support for SSE 4.2. db_bench reports original crc32 speed at: crc32c : 3.610 micros/op; 1082.0 MB/s (4K per op) with this change performance has increased to: crc32c : 0.843 micros/op; 4633.6 MB/s (4K per op) ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=148694935
This commit is contained in:
parent
95cd743e5e
commit
ea175e28f8
6
Makefile
6
Makefile
@ -412,3 +412,9 @@ $(SHARED_OUTDIR)/%.o: %.cc
|
||||
|
||||
$(SHARED_OUTDIR)/%.o: %.c
|
||||
$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
|
||||
|
||||
$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
|
||||
$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
|
||||
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
|
||||
|
@ -63,6 +63,7 @@ PLATFORM_SHARED_EXT="so"
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
|
||||
PLATFORM_SHARED_CFLAGS="-fPIC"
|
||||
PLATFORM_SHARED_VERSIONED=true
|
||||
PLATFORM_SSEFLAGS=
|
||||
|
||||
MEMCMP_FLAG=
|
||||
if [ "$CXX" = "g++" ]; then
|
||||
@ -77,6 +78,7 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
|
||||
PLATFORM_LDFLAGS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Darwin)
|
||||
PLATFORM=OS_MACOSX
|
||||
@ -85,48 +87,56 @@ case "$TARGET_OS" in
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
Linux)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
SunOS)
|
||||
PLATFORM=OS_SOLARIS
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
|
||||
PLATFORM_LIBS="-lpthread -lrt"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
FreeBSD)
|
||||
PLATFORM=OS_FREEBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
NetBSD)
|
||||
PLATFORM=OS_NETBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
|
||||
PLATFORM_LIBS="-lpthread -lgcc_s"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OpenBSD)
|
||||
PLATFORM=OS_OPENBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
DragonFly)
|
||||
PLATFORM=OS_DRAGONFLYBSD
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
|
||||
PLATFORM_LIBS="-lpthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
;;
|
||||
OS_ANDROID_CROSSCOMPILE)
|
||||
PLATFORM=OS_ANDROID
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
|
||||
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
HP-UX)
|
||||
@ -134,6 +144,7 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
|
||||
PLATFORM_LDFLAGS="-pthread"
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
# man ld: +h internal_name
|
||||
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
|
||||
;;
|
||||
@ -142,6 +153,7 @@ case "$TARGET_OS" in
|
||||
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
|
||||
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
|
||||
PORT_FILE=port/port_posix.cc
|
||||
PORT_SSE_FILE=port/port_posix_sse.cc
|
||||
PLATFORM_SHARED_EXT=
|
||||
PLATFORM_SHARED_LDFLAGS=
|
||||
PLATFORM_SHARED_CFLAGS=
|
||||
@ -168,7 +180,7 @@ set +f # re-enable globbing
|
||||
|
||||
# The sources consist of the portable files, plus the platform-specific port
|
||||
# file.
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
|
||||
echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
|
||||
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
|
||||
|
||||
if [ "$CROSS_COMPILE" = "true" ]; then
|
||||
@ -210,6 +222,21 @@ EOF
|
||||
fi
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
|
||||
# Test if gcc SSE 4.2 is supported
|
||||
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null <<EOF
|
||||
int main() {}
|
||||
EOF
|
||||
if [ "$?" = 0 ]; then
|
||||
PLATFORM_SSEFLAGS="-msse4.2"
|
||||
fi
|
||||
|
||||
rm -f $CXXOUTPUT 2>/dev/null
|
||||
fi
|
||||
|
||||
# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
|
||||
if [ -n "$PLATFORM_SSEFLAGS" ]; then
|
||||
PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
|
||||
fi
|
||||
|
||||
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
|
||||
@ -222,6 +249,7 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
|
||||
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
|
||||
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
|
||||
|
@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
|
||||
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
|
||||
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
|
||||
|
||||
// Extend the CRC to include the first n bytes of buf.
|
||||
//
|
||||
// Returns zero if the CRC cannot be extended using acceleration, else returns
|
||||
// the newly extended CRC value (which may also be zero).
|
||||
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
|
||||
|
||||
} // namespace port
|
||||
} // namespace leveldb
|
||||
|
||||
|
@ -148,6 +148,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
|
||||
|
||||
} // namespace port
|
||||
} // namespace leveldb
|
||||
|
||||
|
125
port/port_posix_sse.cc
Normal file
125
port/port_posix_sse.cc
Normal file
@ -0,0 +1,125 @@
|
||||
// Copyright 2016 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A portable implementation of crc32c, optimized to handle
|
||||
// four bytes at a time.
|
||||
//
|
||||
// In a separate source file to allow this accelerated CRC32C function to be
|
||||
// compiled with the appropriate compiler flags to enable x86 SSE 4.2
|
||||
// instructions.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "port/port.h"
|
||||
|
||||
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#elif defined(__GNUC__) && defined(__SSE4_2__)
|
||||
#include <nmmintrin.h>
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
namespace leveldb {
|
||||
namespace port {
|
||||
|
||||
#if defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
|
||||
static inline uint32_t LE_LOAD32(const uint8_t *p) {
|
||||
// SSE is x86 only, so ensured that |p| is always little-endian.
|
||||
uint32_t word;
|
||||
memcpy(&word, p, sizeof(word));
|
||||
return word;
|
||||
}
|
||||
|
||||
// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
|
||||
static inline uint64_t LE_LOAD64(const uint8_t *p) {
|
||||
uint64_t dword;
|
||||
memcpy(&dword, p, sizeof(dword));
|
||||
return dword;
|
||||
}
|
||||
|
||||
static inline bool HaveSSE42() {
|
||||
#if defined(_MSC_VER)
|
||||
int cpu_info[4];
|
||||
__cpuid(cpu_info, 1);
|
||||
return (cpu_info[2] & (1 << 20)) != 0;
|
||||
#elif defined(__GNUC__)
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
__get_cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
return (ecx & (1 << 20)) != 0;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
|
||||
// For further improvements see Intel publication at:
|
||||
// http://download.intel.com/design/intarch/papers/323405.pdf
|
||||
uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
|
||||
#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
return 0;
|
||||
#else
|
||||
static bool have = HaveSSE42();
|
||||
if (!have) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
||||
const uint8_t *e = p + size;
|
||||
uint32_t l = crc ^ 0xffffffffu;
|
||||
|
||||
#define STEP1 do { \
|
||||
l = _mm_crc32_u8(l, *p++); \
|
||||
} while (0)
|
||||
#define STEP4 do { \
|
||||
l = _mm_crc32_u32(l, LE_LOAD32(p)); \
|
||||
p += 4; \
|
||||
} while (0)
|
||||
#define STEP8 do { \
|
||||
l = _mm_crc32_u64(l, LE_LOAD64(p)); \
|
||||
p += 8; \
|
||||
} while (0)
|
||||
|
||||
if (size > 16) {
|
||||
// Process unaligned bytes
|
||||
for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
|
||||
STEP1;
|
||||
}
|
||||
|
||||
// _mm_crc32_u64 is only available on x64.
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
// Process 8 bytes at a time
|
||||
while ((e-p) >= 8) {
|
||||
STEP8;
|
||||
}
|
||||
// Process 4 bytes at a time
|
||||
if ((e-p) >= 4) {
|
||||
STEP4;
|
||||
}
|
||||
#else // !(defined(_M_X64) || defined(__x86_64__))
|
||||
// Process 4 bytes at a time
|
||||
while ((e-p) >= 4) {
|
||||
STEP4;
|
||||
}
|
||||
#endif // defined(_M_X64) || defined(__x86_64__)
|
||||
}
|
||||
// Process the last few bytes
|
||||
while (p != e) {
|
||||
STEP1;
|
||||
}
|
||||
#undef STEP8
|
||||
#undef STEP4
|
||||
#undef STEP1
|
||||
return l ^ 0xffffffffu;
|
||||
#endif // defined(LEVELDB_PLATFORM_POSIX_SSE)
|
||||
}
|
||||
|
||||
} // namespace port
|
||||
} // namespace leveldb
|
@ -8,6 +8,8 @@
|
||||
#include "util/crc32c.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "port/port.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace leveldb {
|
||||
@ -283,7 +285,23 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
|
||||
return DecodeFixed32(reinterpret_cast<const char*>(p));
|
||||
}
|
||||
|
||||
// Determine if the CPU running this program can accelerate the CRC32C
|
||||
// calculation.
|
||||
static bool CanAccelerateCRC32C() {
|
||||
// port::AcceleretedCRC32C returns zero when unable to accelerate.
|
||||
static const char kTestCRCBuffer[] = "TestCRCBuffer";
|
||||
static const char kBufSize = sizeof(kTestCRCBuffer) - 1;
|
||||
static const uint32_t kTestCRCValue = 0xdcbc59fa;
|
||||
|
||||
return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue;
|
||||
}
|
||||
|
||||
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
||||
static bool accelerate = CanAccelerateCRC32C();
|
||||
if (accelerate) {
|
||||
return port::AcceleratedCRC32C(crc, buf, size);
|
||||
}
|
||||
|
||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
||||
const uint8_t *e = p + size;
|
||||
uint32_t l = crc ^ 0xffffffffu;
|
||||
|
Loading…
Reference in New Issue
Block a user