Clean up util/coding.{h,cc}.

1) Inline EncodeFixed{32,64}(). They emit single machine instructions on
   64-bit processors.
2) Remove size narrowing compiler warnings from DecodeFixed{32,64}().
3) Add comments explaining the current state of optimizations in compilers
   we care about.
4) Change C-style includes, like <stdint.h>, to C++ style, like <cstdint>.
5) memcpy -> std::memcpy.

The optimization comments are based on https://godbolt.org/z/RdIqS1. The
missed optimization opportunities in clang have been reported as
https://bugs.llvm.org/show_bug.cgi?id=41761

The change does not have significant impact on benchmarks. Results below.

LevelDB:    version 1.22
Date:       Mon May 6 10:42:18 2019
CPU:        72 * Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz
CPUCache:   25344 KB
Keys:       16 bytes each
Values:     100 bytes each (50 bytes after compression)
Entries:    1000000
RawSize:    110.6 MB (estimated)
FileSize:   62.9 MB (estimated)

With change
------------------------------------------------
fillseq      :       2.327 micros/op;   47.5 MB/s
fillsync     :    4185.526 micros/op;    0.0 MB/s (1000 ops)
fillrandom   :       3.662 micros/op;   30.2 MB/s
overwrite    :       4.261 micros/op;   26.0 MB/s
readrandom   :       4.239 micros/op; (1000000 of 1000000 found)
readrandom   :       3.649 micros/op; (1000000 of 1000000 found)
readseq      :       0.174 micros/op;  636.7 MB/s
readreverse  :       0.271 micros/op;  408.7 MB/s
compact      :  570495.000 micros/op;
readrandom   :       2.735 micros/op; (1000000 of 1000000 found)
readseq      :       0.118 micros/op;  937.3 MB/s
readreverse  :       0.190 micros/op;  583.7 MB/s
fill100K     :     860.164 micros/op;  110.9 MB/s (1000 ops)
crc32c       :       1.131 micros/op; 3455.2 MB/s (4K per op)
snappycomp   :       3.034 micros/op; 1287.5 MB/s (output: 55.1%)
snappyuncomp :       0.544 micros/op; 7176.0 MB/s

Baseline
------------------------------------------------
fillseq      :       2.365 micros/op;   46.8 MB/s
fillsync     :    4240.165 micros/op;    0.0 MB/s (1000 ops)
fillrandom   :       3.244 micros/op;   34.1 MB/s
overwrite    :       4.153 micros/op;   26.6 MB/s
readrandom   :       4.698 micros/op; (1000000 of 1000000 found)
readrandom   :       4.065 micros/op; (1000000 of 1000000 found)
readseq      :       0.192 micros/op;  576.3 MB/s
readreverse  :       0.286 micros/op;  386.7 MB/s
compact      :  635979.000 micros/op;
readrandom   :       3.264 micros/op; (1000000 of 1000000 found)
readseq      :       0.169 micros/op;  652.8 MB/s
readreverse  :       0.213 micros/op;  519.5 MB/s
fill100K     :    1055.367 micros/op;   90.4 MB/s (1000 ops)
crc32c       :       1.353 micros/op; 2887.3 MB/s (4K per op)
snappycomp   :       3.036 micros/op; 1286.7 MB/s (output: 55.1%)
snappyuncomp :       0.540 micros/op; 7238.6 MB/s

PiperOrigin-RevId: 246856811
2019-05-06 10:58:38 -07:00 · 2019-05-06 10:58:38 -07:00 · a7528a5d2b
commit a7528a5d2b
parent 142035edd4
2 changed files with 81 additions and 47 deletions
--- a/util/coding.cc
+++ b/util/coding.cc
@ -6,32 +6,6 @@

 namespace leveldb {

-void EncodeFixed32(char* dst, uint32_t value) {
-  if (port::kLittleEndian) {
-    memcpy(dst, &value, sizeof(value));
-  } else {
-    dst[0] = value & 0xff;
-    dst[1] = (value >> 8) & 0xff;
-    dst[2] = (value >> 16) & 0xff;
-    dst[3] = (value >> 24) & 0xff;
-  }
-}
-
-void EncodeFixed64(char* dst, uint64_t value) {
-  if (port::kLittleEndian) {
-    memcpy(dst, &value, sizeof(value));
-  } else {
-    dst[0] = value & 0xff;
-    dst[1] = (value >> 8) & 0xff;
-    dst[2] = (value >> 16) & 0xff;
-    dst[3] = (value >> 24) & 0xff;
-    dst[4] = (value >> 32) & 0xff;
-    dst[5] = (value >> 40) & 0xff;
-    dst[6] = (value >> 48) & 0xff;
-    dst[7] = (value >> 56) & 0xff;
-  }
-}
-
 void PutFixed32(std::string* dst, uint32_t value) {
  char buf[sizeof(value)];
  EncodeFixed32(buf, value);
--- a/util/coding.h
+++ b/util/coding.h
@ -10,9 +10,8 @@
 #ifndef STORAGE_LEVELDB_UTIL_CODING_H_
 #define STORAGE_LEVELDB_UTIL_CODING_H_

-#include <stdint.h>
-#include <string.h>
-
+#include <cstdint>
+#include <cstring>
 #include <string>

 #include "leveldb/slice.h"
@ -43,45 +42,106 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* v);
 // Returns the length of the varint32 or varint64 encoding of "v"
 int VarintLength(uint64_t v);

-// Lower-level versions of Put... that write directly into a character buffer
-// REQUIRES: dst has enough space for the value being written
-void EncodeFixed32(char* dst, uint32_t value);
-void EncodeFixed64(char* dst, uint64_t value);
-
 // Lower-level versions of Put... that write directly into a character buffer
 // and return a pointer just past the last byte written.
 // REQUIRES: dst has enough space for the value being written
 char* EncodeVarint32(char* dst, uint32_t value);
 char* EncodeVarint64(char* dst, uint64_t value);

+// TODO(costan): Remove port::kLittleEndian and the fast paths based on
+//               std::memcpy when clang learns to optimize the generic code, as
+//               described in https://bugs.llvm.org/show_bug.cgi?id=41761
+//
+// The platform-independent code in DecodeFixed{32,64}() gets optimized to mov
+// on x86 and ldr on ARM64, by both clang and gcc. However, only gcc optimizes
+// the platform-independent code in EncodeFixed{32,64}() to mov / str.
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+
+inline void EncodeFixed32(char* dst, uint32_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint32_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+}
+
+inline void EncodeFixed64(char* dst, uint64_t value) {
+  uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
+
+  if (port::kLittleEndian) {
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / str (ARM) instruction.
+    std::memcpy(buffer, &value, sizeof(uint64_t));
+    return;
+  }
+
+  // Platform-independent code.
+  // Currently, only gcc optimizes this to a single mov / str instruction.
+  buffer[0] = static_cast<uint8_t>(value);
+  buffer[1] = static_cast<uint8_t>(value >> 8);
+  buffer[2] = static_cast<uint8_t>(value >> 16);
+  buffer[3] = static_cast<uint8_t>(value >> 24);
+  buffer[4] = static_cast<uint8_t>(value >> 32);
+  buffer[5] = static_cast<uint8_t>(value >> 40);
+  buffer[6] = static_cast<uint8_t>(value >> 48);
+  buffer[7] = static_cast<uint8_t>(value >> 56);
+}
+
 // Lower-level versions of Get... that read directly from a character buffer
 // without any bounds checking.

 inline uint32_t DecodeFixed32(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
  if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
    uint32_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint32_t));
    return result;
-  } else {
-    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
-            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
  }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint32_t>(buffer[0])) |
+         (static_cast<uint32_t>(buffer[1]) << 8) |
+         (static_cast<uint32_t>(buffer[2]) << 16) |
+         (static_cast<uint32_t>(buffer[3]) << 24);
 }

 inline uint64_t DecodeFixed64(const char* ptr) {
+  const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
+
  if (port::kLittleEndian) {
-    // Load the raw bytes
+    // Fast path for little-endian CPUs. All major compilers optimize this to a
+    // single mov (x86_64) / ldr (ARM) instruction.
    uint64_t result;
-    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    std::memcpy(&result, buffer, sizeof(uint64_t));
    return result;
-  } else {
-    uint64_t lo = DecodeFixed32(ptr);
-    uint64_t hi = DecodeFixed32(ptr + 4);
-    return (hi << 32) | lo;
  }
+
+  // Platform-independent code.
+  // Clang and gcc optimize this to a single mov / ldr instruction.
+  return (static_cast<uint64_t>(buffer[0])) |
+         (static_cast<uint64_t>(buffer[1]) << 8) |
+         (static_cast<uint64_t>(buffer[2]) << 16) |
+         (static_cast<uint64_t>(buffer[3]) << 24) |
+         (static_cast<uint64_t>(buffer[4]) << 32) |
+         (static_cast<uint64_t>(buffer[5]) << 40) |
+         (static_cast<uint64_t>(buffer[6]) << 48) |
+         (static_cast<uint64_t>(buffer[7]) << 56);
 }

 // Internal routine for use by fallback path of GetVarint32Ptr