Add two HyperLogLog functions

author Alvaro Herrera <alvherre@alvh.no-ip.org>

Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)

committer Alvaro Herrera <alvherre@alvh.no-ip.org>

Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c

index 094bc09a44c896b3facd443a18efec68bfcd54ab..fa7f05a2411fd502a3576905ad580ab5cd1b1d87 100644 (file)
--- a/src/backend/lib/hyperloglog.c
+++ b/src/backend/lib/hyperloglog.c
@@ -56,7 +56,7 @@
  static inline uint8 rho(uint32 x, uint8 b);
  
  /*
- * Initialize HyperLogLog track state
+ * Initialize HyperLogLog track state, by bit width
   *
   * bwidth is bit width (so register size will be 2 to the power of bwidth).
   * Must be between 4 and 16 inclusive.
@@ -107,6 +107,52 @@ initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
         cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
  }
  
+/*
+ * Initialize HyperLogLog track state, by error rate
+ *
+ * Instead of specifying bwidth (number of bits used for addressing the
+ * register), this method allows sizing the counter for particular error
+ * rate using a simple formula from the paper:
+ *
+ *      e = 1.04 / sqrt(m)
+ *
+ * where 'm' is the number of registers, i.e. (2^bwidth). The method
+ * finds the lowest bwidth with 'e' below the requested error rate, and
+ * then uses it to initialize the counter.
+ *
+ * As bwidth has to be between 4 and 16, the worst possible error rate
+ * is between ~25% (bwidth=4) and 0.4% (bwidth=16).
+ */
+void
+initHyperLogLogError(hyperLogLogState *cState, double error)
+{
+       uint8           bwidth = 4;
+
+       while (bwidth < 16)
+       {
+               double          m = (Size) 1 << bwidth;
+
+               if (1.04 / sqrt(m) < error)
+                       break;
+               bwidth++;
+       }
+
+       initHyperLogLog(cState, bwidth);
+}
+
+/*
+ * Free HyperLogLog track state
+ *
+ * Releases allocated resources, but not the state itself (in case it's not
+ * allocated by palloc).
+ */
+void
+freeHyperLogLog(hyperLogLogState *cState)
+{
+       Assert(cState->hashesArr != NULL);
+       pfree(cState->hashesArr);
+}
+
  /*
   * Adds element to the estimator, from caller-supplied hash.
   *
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h

index 5a1d4d356aae1b7843bbf4c59c6e34b0e4ca0930..b999b3056a679777681c13a3754d6bfe2e010645 100644 (file)
--- a/src/include/lib/hyperloglog.h
+++ b/src/include/lib/hyperloglog.h
@@ -60,8 +60,10 @@ typedef struct hyperLogLogState
  } hyperLogLogState;
  
  extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+extern void initHyperLogLogError(hyperLogLogState *cState, double error);
  extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash);
  extern double estimateHyperLogLog(hyperLogLogState *cState);
  extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+extern void freeHyperLogLog(hyperLogLogState *cState);
  
  #endif   /* HYPERLOGLOG_H */
author	Alvaro Herrera <alvherre@alvh.no-ip.org>
	Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)
committer	Alvaro Herrera <alvherre@alvh.no-ip.org>
	Tue, 19 Jan 2016 20:40:15 +0000 (17:40 -0300)
src/backend/lib/hyperloglog.c		patch \| blob \| history
src/include/lib/hyperloglog.h		patch \| blob \| history