Fast percentile implementation

2014-07-22 12:26:31 -04:00 · 2014-07-22 12:26:31 -04:00 · a162f38281
commit a162f38281
parent 2f4d0b0f9a
2 changed files with 97 additions and 20 deletions
--- a/stats/percentile.go
+++ b/stats/percentile.go
@@ -1,25 +1,60 @@
 package stats

 import (
+	"math"
 	"sort"
 )

 type Percentile struct {
 	percentile float64
-	values sort.Float64Slice
-	offset int
+
+	samples int64
+	offset  int64
+	values  []float64
 }

 func NewPercentile(percentile float64, sampleWindow int) *Percentile {
 	return &Percentile{
 		percentile: percentile,
-		values: make([]float64, 0, sampleWindow),
+		values:     make([]float64, 0, sampleWindow),
 	}
 }

+// Not thread safe.
 func (p *Percentile) AddSample(sample float64) {
-	p.values = append(p.values, sample)
-	sort.Sort(p.values)
+	p.samples++
+
+	if p.samples > int64(cap(p.values)) {
+		target := float64(p.samples)*p.percentile - float64(cap(p.values))/2
+		offset := round(math.Max(target, 0))
+
+		if sample > p.values[0] {
+			if offset > p.offset {
+				idx := sort.SearchFloat64s(p.values[1:], sample)
+				copy(p.values, p.values[1:idx+1])
+
+				p.values[idx] = sample
+				p.offset++
+			} else if sample < p.values[len(p.values)-1] {
+				idx := sort.SearchFloat64s(p.values, sample)
+				copy(p.values[idx+1:], p.values[idx:])
+
+				p.values[idx] = sample
+			}
+		} else {
+			if offset > p.offset {
+				p.offset++
+			} else {
+				copy(p.values[1:], p.values)
+				p.values[0] = sample
+			}
+		}
+	} else {
+		idx := sort.SearchFloat64s(p.values, sample)
+		p.values = p.values[:len(p.values)+1]
+		copy(p.values[idx+1:], p.values[idx:])
+		p.values[idx] = sample
+	}
 }

 func (p *Percentile) Value() float64 {
@@ -27,11 +62,18 @@ func (p *Percentile) Value() float64 {
 		return 0
 	}

-	return p.values[round(p.index())]
+	return p.values[p.index()]
 }

-func (p *Percentile) index() float64 {
-	return float64(len(p.values)) * p.percentile - float64(p.offset)
+func (p *Percentile) index() int64 {
+	idx := round(float64(p.samples)*p.percentile - float64(p.offset))
+	last := int64(len(p.values)) - 1
+
+	if idx > last {
+		return last
+	}
+
+	return idx
 }

 func round(value float64) int64 {
--- a/stats/percentile_test.go
+++ b/stats/percentile_test.go
@@ -1,30 +1,65 @@
 package stats

 import (
-	"testing"
 	"math/rand"
+	"testing"
+	"time"
 )

 func TestPercentiles(t *testing.T) {
-	testInRange(t, 1, 0.5)
-	testInRange(t, 1, 0.9)
-	testInRange(t, 1, 0.95)
-	testInRange(t, 10000, 0.5)
-	testInRange(t, 10000, 0.9)
-	testInRange(t, 10000, 0.95)
+	rand.Seed(time.Now().Unix())
+
+	testUniformRandom(t, 1, 0.5)
+	testUniformRandom(t, 1, 0.9)
+	testUniformRandom(t, 1, 0.95)
+	testUniformRandom(t, 10000, 0.5)
+	testUniformRandom(t, 10000, 0.9)
+	testUniformRandom(t, 10000, 0.95)
 }

-func testInRange(t *testing.T, max, percentile float64) {
-	p := NewPercentile(percentile, 10)
+func testUniformRandom(t *testing.T, max, percentile float64) {
+	p := NewPercentile(percentile, 256)

-	for i := 0; i < 1000; i++ {
+	for i := 0; i < 100000; i++ {
 		p.AddSample(rand.Float64() * max)
 	}

 	got := p.Value()
 	expected := percentile * max
+	maxError := 0.01

-	if got < expected * (1 - 0.02) || got > expected * (1 + 0.02) {
-		t.Errorf("Percentile out of range\n  actual: %f\nexpected: %f", got, expected)
+	if got < expected*(1-maxError) || got > expected*(1+maxError) {
+		t.Errorf("Percentile out of range\n  actual: %f\nexpected: %f\n   error: %f%%\n", got, expected, (got-expected)/expected*100)
+	}
+}
+
+func BenchmarkPercentiles64(b *testing.B) {
+	benchmarkUniformRandom(b, 64, 0.5)
+}
+
+func BenchmarkPercentiles128(b *testing.B) {
+	benchmarkUniformRandom(b, 128, 0.5)
+}
+
+func BenchmarkPercentiles256(b *testing.B) {
+	benchmarkUniformRandom(b, 256, 0.5)
+}
+
+func BenchmarkPercentiles512(b *testing.B) {
+	benchmarkUniformRandom(b, 512, 0.5)
+}
+
+func benchmarkUniformRandom(b *testing.B, window int, percentile float64) {
+	p := NewPercentile(percentile, window)
+
+	numbers := make([]float64, b.N)
+
+	for i := 0; i < b.N; i++ {
+		numbers[i] = rand.Float64()
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		p.AddSample(numbers[i])
 	}
 }