[PDB] Hash types up front when merging types instead of using StringMap

author Reid Kleckner <rnk@google.com>

Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)

committer Reid Kleckner <rnk@google.com>

Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)
author Reid Kleckner <rnk@google.com>
Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)
committer Reid Kleckner <rnk@google.com>
Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)
diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h

index 6dad98247136fb5fc7ed70da8f3d6911ef8c2706..deeb063c5dfcec246218c0681703ade832f2fc36 100644 (file)
--- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h
+++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h
@@ -17,7 +17,6 @@
  
  #include "llvm/ADT/Optional.h"
  #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringMap.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/Error.h"
@@ -26,6 +25,8 @@ namespace llvm {
  
  namespace codeview {
  
+class TypeHasher;
+
  class TypeSerializer : public TypeVisitorCallbacks {
    struct SubRecord {
      SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {}
@@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisitorCallbacks {
      }
    };
  
-  typedef SmallVector<MutableArrayRef<uint8_t>, 2> RecordList;
+  typedef SmallVector<MutableArrayRef<uint8_t>, 2> MutableRecordList;
  
    static constexpr uint8_t ContinuationLength = 8;
    BumpPtrAllocator &RecordStorage;
    RecordSegment CurrentSegment;
-  RecordList FieldListSegments;
+  MutableRecordList FieldListSegments;
  
-  TypeIndex LastTypeIndex;
    Optional<TypeLeafKind> TypeKind;
    Optional<TypeLeafKind> MemberKind;
    std::vector<uint8_t> RecordBuffer;
@@ -60,28 +60,23 @@ class TypeSerializer : public TypeVisitorCallbacks {
    BinaryStreamWriter Writer;
    TypeRecordMapping Mapping;
  
-  RecordList SeenRecords;
-  StringMap<TypeIndex> HashedRecords;
+  /// Private type record hashing implementation details are handled here.
+  std::unique_ptr<TypeHasher> Hasher;
  
    bool isInFieldList() const;
-  TypeIndex calcNextTypeIndex() const;
-  TypeIndex incrementTypeIndex();
    MutableArrayRef<uint8_t> getCurrentSubRecordData();
    MutableArrayRef<uint8_t> getCurrentRecordData();
    Error writeRecordPrefix(TypeLeafKind Kind);
-  TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
-  TypeIndex insertRecordBytesWithCopy(CVType &Record,
-                                      MutableArrayRef<uint8_t> Data);
  
    Expected<MutableArrayRef<uint8_t>>
    addPadding(MutableArrayRef<uint8_t> Record);
  
  public:
    explicit TypeSerializer(BumpPtrAllocator &Storage);
+  ~TypeSerializer();
  
-  ArrayRef<MutableArrayRef<uint8_t>> records() const;
-  TypeIndex getLastTypeIndex() const;
-  TypeIndex insertRecordBytes(MutableArrayRef<uint8_t> Record);
+  ArrayRef<ArrayRef<uint8_t>> records() const;
+  TypeIndex insertRecordBytes(ArrayRef<uint8_t> Record);
    Expected<TypeIndex> visitTypeEndGetIndex(CVType &Record);
  
    Error visitTypeBegin(CVType &Record) override;
diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h

index 102bee4b0801e1a316b05ef7bd9c46fe2b145af3..de7bc44ae91fb0f5a2ea21ce84f76e398c7a7b38 100644 (file)
--- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
+++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
@@ -64,7 +64,7 @@ public:
      return *ExpectedIndex;
    }
  
-  TypeIndex writeSerializedRecord(MutableArrayRef<uint8_t> Record) {
+  TypeIndex writeSerializedRecord(ArrayRef<uint8_t> Record) {
      return Serializer.insertRecordBytes(Record);
    }
  
@@ -77,9 +77,7 @@ public:
      }
    }
  
-  ArrayRef<MutableArrayRef<uint8_t>> records() const {
-    return Serializer.records();
-  }
+  ArrayRef<ArrayRef<uint8_t>> records() const { return Serializer.records(); }
  };
  
  class FieldListRecordBuilder {
diff --git a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h

index 7de562a19a7418e787d3becf4ed0ad7fa869a8a2..42b62ba2b6ce55b229c6a1aca39f2bf3ee54a99e 100644 (file)
--- a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
+++ b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
@@ -18,7 +18,7 @@ namespace codeview {
  
  class TypeTableCollection : public TypeCollection {
  public:
-  explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
+  explicit TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records);
  
    Optional<TypeIndex> getFirst() override;
    Optional<TypeIndex> getNext(TypeIndex Prev) override;
@@ -33,7 +33,7 @@ private:
    bool hasCapacityFor(TypeIndex Index) const;
    void ensureTypeExists(TypeIndex Index);
  
-  ArrayRef<MutableArrayRef<uint8_t>> Records;
+  ArrayRef<ArrayRef<uint8_t>> Records;
    TypeDatabase Database;
  };
  }
diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp

index 3b061e67e05ec6ced813848d6beb036142c70b1b..afe70a2e8552a85964a931b6d4307ed9c38bb872 100644 (file)
--- a/lib/DebugInfo/CodeView/TypeSerializer.cpp
+++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -9,6 +9,7 @@
  
  #include "llvm/DebugInfo/CodeView/TypeSerializer.h"
  
+#include "llvm/ADT/DenseSet.h"
  #include "llvm/Support/BinaryStreamWriter.h"
  
  #include <string.h>
@@ -16,21 +17,116 @@
  using namespace llvm;
  using namespace llvm::codeview;
  
-bool TypeSerializer::isInFieldList() const {
-  return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
-}
+namespace {
+struct HashedType {
+  uint64_t Hash;
+  const uint8_t *Data;
+  unsigned Size; // FIXME: Go to uint16_t?
+  TypeIndex Index;
+};
+
+/// Wrapper around a poitner to a HashedType. Hash and equality operations are
+/// based on data in the pointee.
+struct HashedTypePtr {
+  HashedTypePtr() = default;
+  HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
+  HashedType *Ptr = nullptr;
+};
+} // namespace
+
+template <> struct DenseMapInfo<HashedTypePtr> {
+  static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
+  static inline HashedTypePtr getTombstoneKey() {
+    return HashedTypePtr(reinterpret_cast<HashedType *>(1));
+  }
+  static unsigned getHashValue(HashedTypePtr Val) {
+    assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
+    return Val.Ptr->Hash;
+  }
+  static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
+    HashedType *LHS = LHSP.Ptr;
+    HashedType *RHS = RHSP.Ptr;
+    if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
+      return LHS == RHS;
+    if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
+      return false;
+    return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
+  }
+};
+
+/// Private implementation so that we don't leak our DenseMap instantiations to
+/// users.
+class llvm::codeview::TypeHasher {
+private:
+  /// Storage for type record provided by the caller. Records will outlive the
+  /// hasher object, so they should be allocated here.
+  BumpPtrAllocator &RecordStorage;
+
+  /// Storage for hash keys. These only need to live as long as the hashing
+  /// operation.
+  BumpPtrAllocator KeyStorage;
+
+  /// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
+  /// but DenseMap is inefficient when the keys are long (like type records)
+  /// because it recomputes the hash value of every key when it grows. This
+  /// value type stores the hash out of line in KeyStorage, so that table
+  /// entries are small and easy to rehash.
+  DenseSet<HashedTypePtr> HashedRecords;
+
+  SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;
+
+  TypeIndex NextTypeIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
+
+public:
+  TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}
+
+  ArrayRef<ArrayRef<uint8_t>> records() const { return SeenRecords; }
+
+  /// Takes the bytes of type record, inserts them into the hash table, saves
+  /// them, and returns a pointer to an identical stable type record along with
+  /// its type index in the destination stream.
+  TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record);
+};
+
+TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record) {
+  assert(Record.size() < UINT32_MAX && "Record too big");
+  assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
+
+  // Compute the hash up front so we can store it in the key.
+  HashedType TempHashedType = {hash_value(Record), Record.data(),
+                               unsigned(Record.size()), NextTypeIndex};
+
+  auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType));
+  HashedType *&Hashed = Result.first->Ptr;
+
+  if (Result.second) {
+    // This was a new type record. We need stable storage for both the key and
+    // the record. The record should outlive the hashing operation.
+    Hashed = KeyStorage.Allocate<HashedType>();
+    *Hashed = TempHashedType;
+
+    uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
+    memcpy(Stable, Record.data(), Record.size());
+    Hashed->Data = Stable;
+    assert(Hashed->Size == Record.size());
+
+    // This was a new record, so increment our next type index.
+    ++NextTypeIndex;
+  }
+
+  // Update the caller's copy of Record to point a stable copy.
+  Record = ArrayRef<uint8_t>(Hashed->Data, Hashed->Size);
+
+  if (Result.second) {
+    // FIXME: Can we record these in a more efficient way?
+    SeenRecords.push_back(Record);
+  }
  
-TypeIndex TypeSerializer::calcNextTypeIndex() const {
-  if (LastTypeIndex.isNoneType())
-    return TypeIndex(TypeIndex::FirstNonSimpleIndex);
-  else
-    return TypeIndex(LastTypeIndex.getIndex() + 1);
+  return TypeIndex(Hashed->Index);
  }
  
-TypeIndex TypeSerializer::incrementTypeIndex() {
-  TypeIndex Previous = LastTypeIndex;
-  LastTypeIndex = calcNextTypeIndex();
-  return Previous;
+bool TypeSerializer::isInFieldList() const {
+  return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
  }
  
  MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
@@ -51,46 +147,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) {
    return Error::success();
  }
  
-TypeIndex
-TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
-  assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
-
-  StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
-
-  TypeIndex NextTypeIndex = calcNextTypeIndex();
-  auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
-  if (Result.second) {
-    LastTypeIndex = NextTypeIndex;
-    SeenRecords.push_back(Record);
-  }
-  return Result.first->getValue();
-}
-
-TypeIndex
-TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
-                                          MutableArrayRef<uint8_t> Data) {
-  assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
-
-  StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());
-
-  // Do a two state lookup / insert so that we don't have to allocate unless
-  // we're going
-  // to do an insert.  This is a big memory savings.
-  auto Iter = HashedRecords.find(S);
-  if (Iter != HashedRecords.end())
-    return Iter->second;
-
-  LastTypeIndex = calcNextTypeIndex();
-  uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
-  ::memcpy(Copy, Data.data(), Data.size());
-  Data = MutableArrayRef<uint8_t>(Copy, Data.size());
-  S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
-  HashedRecords.insert(std::make_pair(S, LastTypeIndex));
-  SeenRecords.push_back(Data);
-  Record.RecordData = Data;
-  return LastTypeIndex;
-}
-
  Expected<MutableArrayRef<uint8_t>>
  TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
    uint32_t Align = Record.size() % 4;
@@ -109,26 +165,25 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
  }
  
  TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
-    : RecordStorage(Storage), LastTypeIndex(),
-      RecordBuffer(MaxRecordLength * 2),
+    : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
        Stream(RecordBuffer, llvm::support::little), Writer(Stream),
-      Mapping(Writer) {
+      Mapping(Writer), Hasher(make_unique<TypeHasher>(Storage)) {
    // RecordBuffer needs to be able to hold enough data so that if we are 1
    // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
    // we won't overflow.
  }
  
-ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
-  return SeenRecords;
-}
+TypeSerializer::~TypeSerializer() = default;
  
-TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
+ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
+  return Hasher->records();
+}
  
-TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
+TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> Record) {
    assert(!TypeKind.hasValue() && "Already in a type mapping!");
    assert(Writer.getOffset() == 0 && "Stream has data already!");
  
-  return insertRecordBytesPrivate(Record);
+  return Hasher->getOrCreateRecord(Record);
  }
  
  Error TypeSerializer::visitTypeBegin(CVType &Record) {
@@ -163,8 +218,8 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
    Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
  
    Record.Type = *TypeKind;
-  TypeIndex InsertedTypeIndex =
-      insertRecordBytesWithCopy(Record, ThisRecordData);
+  Record.RecordData = ThisRecordData;
+  TypeIndex InsertedTypeIndex = Hasher->getOrCreateRecord(Record.RecordData);
  
    // Write out each additional segment in reverse order, and update each
    // record's continuation index to point to the previous one.
@@ -174,7 +229,7 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
          reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
      assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
      *CI = InsertedTypeIndex.getIndex();
-    InsertedTypeIndex = insertRecordBytesPrivate(X);
+    InsertedTypeIndex = Hasher->getOrCreateRecord(X);
    }
  
    TypeKind.reset();
diff --git a/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/lib/DebugInfo/CodeView/TypeTableCollection.cpp

index a18710d6ab52c77bcd58a81dcf20c91e2ad952f1..699694fde9284f68fc2289c62f4b6528a2d532fa 100644 (file)
--- a/lib/DebugInfo/CodeView/TypeTableCollection.cpp
+++ b/lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -24,8 +24,7 @@ static void error(Error &&EC) {
      consumeError(std::move(EC));
  }
  
-TypeTableCollection::TypeTableCollection(
-    ArrayRef<MutableArrayRef<uint8_t>> Records)
+TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
      : Records(Records), Database(Records.size()) {}
  
  Optional<TypeIndex> TypeTableCollection::getFirst() {
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp

index 6b6a0ef046a22d6253cf536e3d429850c51a6342..64688559323e62b8283b65a76a7580a4d6b4d66c 100644 (file)
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -877,14 +877,12 @@ static void mergePdbs() {
  
    auto &DestTpi = Builder.getTpiBuilder();
    auto &DestIpi = Builder.getIpiBuilder();
-  MergedTpi.ForEachRecord(
-      [&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
-        DestTpi.addTypeRecord(Data, None);
-      });
-  MergedIpi.ForEachRecord(
-      [&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
-        DestIpi.addTypeRecord(Data, None);
-      });
+  MergedTpi.ForEachRecord([&DestTpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
+    DestTpi.addTypeRecord(Data, None);
+  });
+  MergedIpi.ForEachRecord([&DestIpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
+    DestIpi.addTypeRecord(Data, None);
+  });
  
    SmallString<64> OutFile(opts::merge::PdbOutputFile);
    if (OutFile.empty()) {
author	Reid Kleckner <rnk@google.com>
	Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)
committer	Reid Kleckner <rnk@google.com>
	Tue, 23 May 2017 18:23:59 +0000 (18:23 +0000)
include/llvm/DebugInfo/CodeView/TypeSerializer.h		patch \| blob \| history
include/llvm/DebugInfo/CodeView/TypeTableBuilder.h		patch \| blob \| history
include/llvm/DebugInfo/CodeView/TypeTableCollection.h		patch \| blob \| history
lib/DebugInfo/CodeView/TypeSerializer.cpp		patch \| blob \| history
lib/DebugInfo/CodeView/TypeTableCollection.cpp		patch \| blob \| history
tools/llvm-pdbdump/llvm-pdbdump.cpp		patch \| blob \| history