X-Git-Url: https://git.cworth.org/git?a=blobdiff_plain;f=thirdparty%2Fsnappy%2Fsnappy.cc;h=4d4eb42a4998e819bf91078c9878e5ff95d80608;hb=af0f4f9483fc9aef4d3af914d80d56b560272dcb;hp=c79edb58a7fe50db0d35ff724d042f369fa99570;hpb=df1a1816c13e6fcdf63a45c973f09f04af318073;p=apitrace

diff --git a/thirdparty/snappy/snappy.cc b/thirdparty/snappy/snappy.cc
index c79edb5..4d4eb42 100644
--- a/thirdparty/snappy/snappy.cc
+++ b/thirdparty/snappy/snappy.cc
@@ -140,12 +140,12 @@ const int kMaxIncrementCopyOverflow = 10;
 
 static inline void IncrementalCopyFastPath(const char* src, char* op, int len) {
   while (op - src < 8) {
-    UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+    UnalignedCopy64(src, op);
     len -= op - src;
     op += op - src;
   }
   while (len > 0) {
-    UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+    UnalignedCopy64(src, op);
     src += 8;
     op += 8;
     len -= 8;
@@ -172,8 +172,8 @@ static inline char* EmitLiteral(char* op,
     // - The output will always have 32 spare bytes (see
     //   MaxCompressedLength).
     if (allow_fast_path && len <= 16) {
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(literal + 8));
+      UnalignedCopy64(literal, op);
+      UnalignedCopy64(literal + 8, op + 8);
       return op + len;
     }
   } else {
@@ -194,13 +194,13 @@ static inline char* EmitLiteral(char* op,
   return op + len;
 }
 
-static inline char* EmitCopyLessThan64(char* op, int offset, int len) {
+static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
   DCHECK_LE(len, 64);
   DCHECK_GE(len, 4);
   DCHECK_LT(offset, 65536);
 
   if ((len < 12) && (offset < 2048)) {
-    int len_minus_4 = len - 4;
+    size_t len_minus_4 = len - 4;
     assert(len_minus_4 < 8);        // Must fit in 3 bits
     *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5);
     *op++ = offset & 0xff;
@@ -212,7 +212,7 @@ static inline char* EmitCopyLessThan64(char* op, int offset, int len) {
   return op;
 }
 
-static inline char* EmitCopy(char* op, int offset, int len) {
+static inline char* EmitCopy(char* op, size_t offset, int len) {
   // Emit 64 byte copies but make sure to keep at least four bytes reserved
   while (len >= 68) {
     op = EmitCopyLessThan64(op, offset, 64);
@@ -249,7 +249,7 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
   // compression, and if the input is short, we won't need that
   // many hash table entries anyway.
   assert(kMaxHashTableSize >= 256);
-  int htsize = 256;
+  size_t htsize = 256;
   while (htsize < kMaxHashTableSize && htsize < input_size) {
     htsize <<= 1;
   }
@@ -272,16 +272,49 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
 }
 }  // end namespace internal
 
-// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
 // equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
 // empirically found that overlapping loads such as
 //  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
 // are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+//
+// We have different versions for 64- and 32-bit; ideally we would avoid the
+// two functions and just inline the UNALIGNED_LOAD64 call into
+// GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
+// enough to avoid loading the value multiple times then.  For 64-bit, the load
+// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
+// done at GetUint32AtOffset() time.
+
+#ifdef ARCH_K8
+
+typedef uint64 EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return UNALIGNED_LOAD64(ptr);
+}
+
 static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
-  DCHECK(0 <= offset && offset <= 4) << offset;
+  DCHECK_GE(offset, 0);
+  DCHECK_LE(offset, 4);
   return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
 }
 
+#else
+
+typedef const char* EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+  return ptr;
+}
+
+static inline uint32 GetUint32AtOffset(const char* v, int offset) {
+  DCHECK_GE(offset, 0);
+  DCHECK_LE(offset, 4);
+  return UNALIGNED_LOAD32(v + offset);
+}
+
+#endif
+
 // Flat array compression that does not emit the "uncompressed length"
 // prefix. Compresses "input" string to the "*op" buffer.
 //
@@ -304,14 +337,14 @@ char* CompressFragment(const char* input,
   CHECK_LE(input_size, kBlockSize);
   CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two";
   const int shift = 32 - Bits::Log2Floor(table_size);
-  DCHECK_EQ(kuint32max >> shift, table_size - 1);
+  DCHECK_EQ(static_cast<int>(kuint32max >> shift), table_size - 1);
   const char* ip_end = input + input_size;
   const char* base_ip = ip;
   // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
   // [next_emit, ip_end) after the main loop.
   const char* next_emit = ip;
 
-  const int kInputMarginBytes = 15;
+  const size_t kInputMarginBytes = 15;
   if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
     const char* ip_limit = input + input_size - kInputMarginBytes;
 
@@ -378,7 +411,7 @@ char* CompressFragment(const char* input,
       // though we don't yet know how big the literal will be. We handle that
      // by proceeding to the next iteration of the main loop. We also can exit
      // this loop via goto if we get close to exhausting the input.
-      uint64 input_bytes = 0;
+      EightBytesReference input_bytes;
       uint32 candidate_bytes = 0;
 
       do {
@@ -387,7 +420,7 @@ char* CompressFragment(const char* input,
         const char* base = ip;
         int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
         ip += matched;
-        int offset = base - candidate;
+        size_t offset = base - candidate;
         DCHECK_EQ(0, memcmp(base, candidate, matched));
         op = EmitCopy(op, offset, matched);
         // We could immediately start working at ip now, but to improve
@@ -397,7 +430,7 @@ char* CompressFragment(const char* input,
         if (PREDICT_FALSE(ip >= ip_limit)) {
           goto emit_remainder;
         }
-        input_bytes = UNALIGNED_LOAD64(insert_tail);
+        input_bytes = GetEightBytesAt(insert_tail);
         uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
         table[prev_hash] = ip - base_ip - 1;
         uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
@@ -435,12 +468,26 @@ char* CompressFragment(const char* input,
 //   bool CheckLength() const;
 //
 //   // Called repeatedly during decompression
-//   bool Append(const char* ip, uint32 length, bool allow_fast_path);
-//   bool AppendFromSelf(uint32 offset, uint32 length);
-// };
+//   bool Append(const char* ip, size_t length);
+//   bool AppendFromSelf(uint32 offset, size_t length);
 //
-// "allow_fast_path" is a parameter that says if there is at least 16
-// readable bytes in "ip". It is currently only used by SnappyArrayWriter.
+//   // The difference between TryFastAppend and Append is that TryFastAppend
+//   // is allowed to read up to <available> bytes from the input buffer,
+//   // whereas Append is allowed to read <length>.
+//   //
+//   // Also, TryFastAppend is allowed to return false, declining the append,
+//   // without it being a fatal error -- just "return false" would be
+//   // a perfectly legal implementation of TryFastAppend. The intention
+//   // is for TryFastAppend to allow a fast path in the common case of
+//   // a small append.
+//   //
+//   // NOTE(user): TryFastAppend must always return decline (return false)
+//   // if <length> is 61 or more, as in this case the literal length is not
+//   // decoded fully. In practice, this should not be a big problem,
+//   // as it is unlikely that one would implement a fast path accepting
+//   // this much data.
+//   bool TryFastAppend(const char* ip, size_t available, size_t length);
+// };
 
 // -----------------------------------------------------------------------
 // Lookup table for decompression code. Generated by ComputeTable() below.
@@ -587,7 +634,6 @@ static void ComputeTable() {
     CHECK_EQ(dst[i], char_table[i]);
   }
 }
-REGISTER_MODULE_INITIALIZER(snappy, ComputeTable());
 #endif /* !NDEBUG */
 
 // Helper class for decompression
@@ -655,29 +701,41 @@ class SnappyDecompressor {
   template <class Writer>
   void DecompressAllTags(Writer* writer) {
     const char* ip = ip_;
-    for ( ;; ) {
-      if (ip_limit_ - ip < 5) {
-        ip_ = ip;
-        if (!RefillTag()) return;
-        ip = ip_;
-      }
 
+    // We could have put this refill fragment only at the beginning of the loop.
+    // However, duplicating it at the end of each branch gives the compiler more
+    // scope to optimize the <ip_limit_ - ip> expression based on the local
+    // context, which overall increases speed.
+    #define MAYBE_REFILL() \
+        if (ip_limit_ - ip < 5) { \
+          ip_ = ip; \
+          if (!RefillTag()) return; \
+          ip = ip_; \
+        }
+
+    MAYBE_REFILL();
+    for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
       if ((c & 0x3) == LITERAL) {
-        uint32 literal_length = c >> 2;
-        if (PREDICT_FALSE(literal_length >= 60)) {
+        size_t literal_length = (c >> 2) + 1u;
+        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+          DCHECK_LT(literal_length, 61);
+          ip += literal_length;
+          MAYBE_REFILL();
+          continue;
+        }
+        if (PREDICT_FALSE(literal_length >= 61)) {
           // Long literal.
-          const uint32 literal_length_length = literal_length - 59;
+          const size_t literal_length_length = literal_length - 60;
           literal_length =
-              LittleEndian::Load32(ip) & wordmask[literal_length_length];
+              (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
           ip += literal_length_length;
         }
-        ++literal_length;
 
-        uint32 avail = ip_limit_ - ip;
+        size_t avail = ip_limit_ - ip;
         while (avail < literal_length) {
-          if (!writer->Append(ip, avail, false)) return;
+          if (!writer->Append(ip, avail)) return;
           literal_length -= avail;
           reader_->Skip(peeked_);
           size_t n;
@@ -687,11 +745,11 @@ class SnappyDecompressor {
           if (avail == 0) return;  // Premature end of input
           ip_limit_ = ip + avail;
         }
-        bool allow_fast_path = (avail >= 16);
-        if (!writer->Append(ip, literal_length, allow_fast_path)) {
+        if (!writer->Append(ip, literal_length)) {
           return;
         }
         ip += literal_length;
+        MAYBE_REFILL();
       } else {
         const uint32 entry = char_table[c];
         const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
@@ -705,8 +763,11 @@ class SnappyDecompressor {
         if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
           return;
         }
+        MAYBE_REFILL();
       }
     }
+
+#undef MAYBE_REFILL
   }
 };
 
@@ -777,6 +838,15 @@ static bool InternalUncompress(Source* r,
   SnappyDecompressor decompressor(r);
   uint32 uncompressed_len = 0;
   if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
+  return InternalUncompressAllTags(
+      &decompressor, writer, uncompressed_len, max_len);
+}
+
+template <typename Writer>
+static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
+                                      Writer* writer,
+                                      uint32 uncompressed_len,
+                                      uint32 max_len) {
   // Protect against possible DoS attack
   if (static_cast<uint64>(uncompressed_len) > max_len) {
     return false;
@@ -785,8 +855,8 @@ static bool InternalUncompress(Source* r,
   writer->SetExpectedLength(uncompressed_len);
 
   // Process the entire input
-  decompressor.DecompressAllTags(writer);
-  return (decompressor.eof() && writer->CheckLength());
+  decompressor->DecompressAllTags(writer);
+  return (decompressor->eof() && writer->CheckLength());
 }
 
 bool GetUncompressedLength(Source* source, uint32* result) {
@@ -796,7 +866,7 @@ bool GetUncompressedLength(Source* source, uint32* result) {
 
 size_t Compress(Source* reader, Sink* writer) {
   size_t written = 0;
-  int N = reader->Available();
+  size_t N = reader->Available();
   char ulength[Varint::kMax32];
   char* p = Varint::Encode32(ulength, N);
   writer->Append(ulength, p-ulength);
@@ -811,10 +881,10 @@ size_t Compress(Source* reader, Sink* writer) {
     size_t fragment_size;
     const char* fragment = reader->Peek(&fragment_size);
     DCHECK_NE(fragment_size, 0) << ": premature end of input";
-    const int num_to_read = min(N, kBlockSize);
+    const size_t num_to_read = min(N, kBlockSize);
     size_t bytes_read = fragment_size;
 
-    int pending_advance = 0;
+    size_t pending_advance = 0;
     if (bytes_read >= num_to_read) {
       // Buffer returned by reader is large enough
       pending_advance = num_to_read;
@@ -902,34 +972,42 @@ class SnappyArrayWriter {
     return op_ == op_limit_;
   }
 
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, size_t len) {
     char* op = op_;
-    const int space_left = op_limit_ - op;
-    if (allow_fast_path && len <= 16 && space_left >= 16) {
-      // Fast path, used for the majority (about 90%) of dynamic invocations.
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
-    } else {
-      if (space_left < len) {
-        return false;
-      }
-      memcpy(op, ip, len);
+    const size_t space_left = op_limit_ - op;
+    if (space_left < len) {
+      return false;
     }
+    memcpy(op, ip, len);
     op_ = op + len;
     return true;
   }
 
-  inline bool AppendFromSelf(uint32 offset, uint32 len) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
     char* op = op_;
-    const int space_left = op_limit_ - op;
+    const size_t space_left = op_limit_ - op;
+    if (len <= 16 && available >= 16 && space_left >= 16) {
+      // Fast path, used for the majority (about 95%) of invocations.
+      UnalignedCopy64(ip, op);
+      UnalignedCopy64(ip + 8, op + 8);
+      op_ = op + len;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  inline bool AppendFromSelf(size_t offset, size_t len) {
+    char* op = op_;
+    const size_t space_left = op_limit_ - op;
 
     if (op - base_ <= offset - 1u) {  // -1u catches offset==0
       return false;
     }
     if (len <= 16 && offset >= 8 && space_left >= 16) {
       // Fast path, used for the majority (70-80%) of dynamic invocations.
-      UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
-      UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+      UnalignedCopy64(op - offset, op);
+      UnalignedCopy64(op - offset + 8, op + 8);
     } else {
       if (space_left >= len + kMaxIncrementCopyOverflow) {
         IncrementalCopyFastPath(op - offset, op, len);
@@ -985,11 +1063,14 @@ class SnappyDecompressionValidator {
   inline bool CheckLength() const {
     return expected_ == produced_;
   }
-  inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+  inline bool Append(const char* ip, size_t len) {
     produced_ += len;
     return produced_ <= expected_;
   }
-  inline bool AppendFromSelf(uint32 offset, uint32 len) {
+  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
+    return false;
+  }
+  inline bool AppendFromSelf(size_t offset, size_t len) {
     if (produced_ <= offset - 1u) return false;  // -1u catches offset==0
     produced_ += len;
     return produced_ <= expected_;