diff options
Diffstat (limited to 'libgo/go/index/suffixarray/suffixarray_test.go')
-rw-r--r-- | libgo/go/index/suffixarray/suffixarray_test.go | 359 |
1 files changed, 335 insertions, 24 deletions
diff --git a/libgo/go/index/suffixarray/suffixarray_test.go b/libgo/go/index/suffixarray/suffixarray_test.go index 644f00c7577..28090de9aa2 100644 --- a/libgo/go/index/suffixarray/suffixarray_test.go +++ b/libgo/go/index/suffixarray/suffixarray_test.go @@ -6,7 +6,11 @@ package suffixarray import ( "bytes" + "fmt" + "io/ioutil" "math/rand" + "os" + "path/filepath" "regexp" "sort" "strings" @@ -207,10 +211,19 @@ func testLookups(t *testing.T, tc *testCase, x *Index, n int) { // index is used to hide the sort.Interface type index Index -func (x *index) Len() int { return len(x.sa) } +func (x *index) Len() int { return x.sa.len() } func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } -func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } -func (a *index) at(i int) []byte { return a.data[a.sa[i]:] } +func (x *index) Swap(i, j int) { + if x.sa.int32 != nil { + x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i] + } else { + x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i] + } +} + +func (x *index) at(i int) []byte { + return x.data[x.sa.get(i):] +} func testConstruction(t *testing.T, tc *testCase, x *Index) { if !sort.IsSorted((*index)(x)) { @@ -222,8 +235,12 @@ func equal(x, y *Index) bool { if !bytes.Equal(x.data, y.data) { return false } - for i, j := range x.sa { - if j != y.sa[i] { + if x.sa.len() != y.sa.len() { + return false + } + n := x.sa.len() + for i := 0; i < n; i++ { + if x.sa.get(i) != y.sa.get(i) { return false } } @@ -238,16 +255,41 @@ func testSaveRestore(t *testing.T, tc *testCase, x *Index) int { } size := buf.Len() var y Index - if err := y.Read(&buf); err != nil { + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + + old := maxData32 + defer func() { + maxData32 = old + }() + // Reread as forced 32. + y = Index{} + maxData32 = realMaxData32 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { t.Errorf("failed reading index %s (%s)", tc.name, err) } if !equal(x, &y) { t.Errorf("restored index doesn't match saved index %s", tc.name) } + + // Reread as forced 64. + y = Index{} + maxData32 = -1 + if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil { + t.Errorf("failed reading index %s (%s)", tc.name, err) + } + if !equal(x, &y) { + t.Errorf("restored index doesn't match saved index %s", tc.name) + } + return size } -func TestIndex(t *testing.T) { +func testIndex(t *testing.T) { for _, tc := range testCases { x := New([]byte(tc.source)) testConstruction(t, &tc, x) @@ -260,45 +302,314 @@ func TestIndex(t *testing.T) { } } +func TestIndex32(t *testing.T) { + testIndex(t) +} + +func TestIndex64(t *testing.T) { + maxData32 = -1 + defer func() { + maxData32 = realMaxData32 + }() + testIndex(t) +} + +func TestNew32(t *testing.T) { + test(t, func(x []byte) []int { + sa := make([]int32, len(x)) + text_32(x, sa) + out := make([]int, len(sa)) + for i, v := range sa { + out[i] = int(v) + } + return out + }) +} + +func TestNew64(t *testing.T) { + test(t, func(x []byte) []int { + sa := make([]int64, len(x)) + text_64(x, sa) + out := make([]int, len(sa)) + for i, v := range sa { + out[i] = int(v) + } + return out + }) +} + +// test tests an arbitrary suffix array construction function. +// Generates many inputs, builds and checks suffix arrays. +func test(t *testing.T, build func([]byte) []int) { + t.Run("ababab...", func(t *testing.T) { + // Very repetitive input has numLMS = len(x)/2-1 + // at top level, the largest it can be. + // But maxID is only two (aba and ab$). + size := 100000 + if testing.Short() { + size = 10000 + } + x := make([]byte, size) + for i := range x { + x[i] = "ab"[i%2] + } + testSA(t, x, build) + }) + + t.Run("forcealloc", func(t *testing.T) { + // Construct a pathological input that forces + // recurse_32 to allocate a new temporary buffer. + // The input must have more than N/3 LMS-substrings, + // which we arrange by repeating an SLSLSLSLSLSL pattern + // like ababab... above, but then we must also arrange + // for a large number of distinct LMS-substrings. + // We use this pattern: + // 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ... + // This gives approximately 2¹⁵ distinct LMS-substrings. + // We need to repeat at least one substring, though, + // or else the recursion can be bypassed entirely. + x := make([]byte, 100000, 100001) + lo := byte(1) + hi := byte(255) + for i := range x { + if i%2 == 0 { + x[i] = lo + } else { + x[i] = hi + hi-- + if hi <= lo { + lo++ + if lo == 0 { + lo = 1 + } + hi = 255 + } + } + } + x[:cap(x)][len(x)] = 0 // for sais.New + testSA(t, x, build) + }) + + t.Run("exhaustive2", func(t *testing.T) { + // All inputs over {0,1} up to length 21. + // Runs in about 10 seconds on my laptop. + x := make([]byte, 30) + numFail := 0 + for n := 0; n <= 21; n++ { + if n > 12 && testing.Short() { + break + } + x[n] = 0 // for sais.New + testRec(t, x[:n], 0, 2, &numFail, build) + } + }) + + t.Run("exhaustive3", func(t *testing.T) { + // All inputs over {0,1,2} up to length 14. + // Runs in about 10 seconds on my laptop. + x := make([]byte, 30) + numFail := 0 + for n := 0; n <= 14; n++ { + if n > 8 && testing.Short() { + break + } + x[n] = 0 // for sais.New + testRec(t, x[:n], 0, 3, &numFail, build) + } + }) +} + +// testRec fills x[i:] with all possible combinations of values in [1,max] +// and then calls testSA(t, x, build) for each one. +func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) { + if i < len(x) { + for x[i] = 1; x[i] <= byte(max); x[i]++ { + testRec(t, x, i+1, max, numFail, build) + } + return + } + + if !testSA(t, x, build) { + *numFail++ + if *numFail >= 10 { + t.Errorf("stopping after %d failures", *numFail) + t.FailNow() + } + } +} + +// testSA tests the suffix array build function on the input x. +// It constructs the suffix array and then checks that it is correct. +func testSA(t *testing.T, x []byte, build func([]byte) []int) bool { + defer func() { + if e := recover(); e != nil { + t.Logf("build %v", x) + panic(e) + } + }() + sa := build(x) + if len(sa) != len(x) { + t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x)) + return false + } + for i := 0; i+1 < len(sa); i++ { + if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) { + t.Errorf("build %s: sa out of range: %v\n", x, sa) + return false + } + if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 { + t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1]) + return false + } + } + + return true +} + +var ( + benchdata = make([]byte, 1e6) + benchrand = make([]byte, 1e6) +) + // Of all possible inputs, the random bytes have the least amount of substring // repetition, and the repeated bytes have the most. For most algorithms, // the running time of every input will be between these two. func benchmarkNew(b *testing.B, random bool) { + b.ReportAllocs() b.StopTimer() - data := make([]byte, 1e6) + data := benchdata if random { - for i := range data { - data[i] = byte(rand.Intn(256)) + data = benchrand + if data[0] == 0 { + for i := range data { + data[i] = byte(rand.Intn(256)) + } } } b.StartTimer() + b.SetBytes(int64(len(data))) for i := 0; i < b.N; i++ { New(data) } } -func BenchmarkNewIndexRandom(b *testing.B) { - benchmarkNew(b, true) +func makeText(name string) ([]byte, error) { + var data []byte + switch name { + case "opticks": + var err error + data, err = ioutil.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") + if err != nil { + return nil, err + } + case "go": + err := filepath.Walk("../..", func(path string, info os.FileInfo, err error) error { + if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() { + file, err := ioutil.ReadFile(path) + if err != nil { + return err + } + data = append(data, file...) + } + return nil + }) + if err != nil { + return nil, err + } + case "zero": + data = make([]byte, 50e6) + case "rand": + data = make([]byte, 50e6) + for i := range data { + data[i] = byte(rand.Intn(256)) + } + } + return data, nil } -func BenchmarkNewIndexRepeat(b *testing.B) { - benchmarkNew(b, false) + +func setBits(bits int) (cleanup func()) { + if bits == 32 { + maxData32 = realMaxData32 + } else { + maxData32 = -1 // force use of 64-bit code + } + return func() { + maxData32 = realMaxData32 + } +} + +func BenchmarkNew(b *testing.B) { + for _, text := range []string{"opticks", "go", "zero", "rand"} { + b.Run("text="+text, func(b *testing.B) { + data, err := makeText(text) + if err != nil { + b.Fatal(err) + } + if testing.Short() && len(data) > 5e6 { + data = data[:5e6] + } + for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} { + if len(data) < size { + continue + } + data := data[:size] + name := fmt.Sprintf("%dK", size/1e3) + if size >= 1e6 { + name = fmt.Sprintf("%dM", size/1e6) + } + b.Run("size="+name, func(b *testing.B) { + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.SetBytes(int64(len(data))) + b.ReportAllocs() + for i := 0; i < b.N; i++ { + New(data) + } + }) + } + }) + } + }) + } } func BenchmarkSaveRestore(b *testing.B) { - b.StopTimer() r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence data := make([]byte, 1<<20) // 1MB of data to index for i := range data { data[i] = byte(r.Intn(256)) } - x := New(data) - size := testSaveRestore(nil, nil, x) // verify correctness - buf := bytes.NewBuffer(make([]byte, size)) // avoid growing - b.SetBytes(int64(size)) - b.StartTimer() - for i := 0; i < b.N; i++ { - x.Write(buf) - var y Index - y.Read(buf) + for _, bits := range []int{32, 64} { + if ^uint(0) == 0xffffffff && bits == 64 { + continue + } + b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) { + cleanup := setBits(bits) + defer cleanup() + + b.StopTimer() + x := New(data) + size := testSaveRestore(nil, nil, x) // verify correctness + buf := bytes.NewBuffer(make([]byte, size)) // avoid growing + b.SetBytes(int64(size)) + b.StartTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + buf.Reset() + if err := x.Write(buf); err != nil { + b.Fatal(err) + } + var y Index + if err := y.Read(buf); err != nil { + b.Fatal(err) + } + } + }) } } |