| author | Ian Lance Taylor <iant@golang.org> | 2018-01-09 01:23:08 +0000 |
|---|---|---|
| committer | Ian Lance Taylor <ian@gcc.gnu.org> | 2018-01-09 01:23:08 +0000 |
| commit | 1a2f01efa63036a5104f203a4789e682c0e0915d (patch) | |
| tree | 373e15778dc8295354584e1f86915ae493b604ff /libgo/go/runtime | |
| parent | 8799df67f2dab88f9fda11739c501780a85575e2 (diff) | |
libgo: update to Go1.10beta1
Update the Go library to the 1.10beta1 release.
This requires a few changes to the compiler to support the modified map
runtime code, and to handle some nowritebarrier cases in the runtime.
Reviewed-on: https://go-review.googlesource.com/86455
gotools/:
* Makefile.am (go_cmd_vet_files): New variable.
(go_cmd_buildid_files, go_cmd_test2json_files): New variables.
(s-zdefaultcc): Change from constants to functions.
(noinst_PROGRAMS): Add vet, buildid, and test2json.
(cgo$(EXEEXT)): Link against $(LIBGOTOOL).
(vet$(EXEEXT)): New target.
(buildid$(EXEEXT)): New target.
(test2json$(EXEEXT)): New target.
(install-exec-local): Install all $(noinst_PROGRAMS).
(uninstall-local): Uninstall all $(noinst_PROGRAMS).
(check-go-tool): Depend on $(noinst_PROGRAMS). Copy down
objabi.go.
(check-runtime): Depend on $(noinst_PROGRAMS).
(check-cgo-test, check-carchive-test): Likewise.
(check-vet): New target.
(check): Depend on check-vet. Look at cmd_vet-testlog.
(.PHONY): Add check-vet.
* Makefile.in: Rebuild.
From-SVN: r256365
Diffstat (limited to 'libgo/go/runtime')
94 files changed, 4855 insertions, 1973 deletions
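The cgo_gccgo.go hunk below documents the `cgoAlwaysFalse` trick: cgo-generated code wraps a call to `cgoUse(p)` in `if cgoAlwaysFalse { ... }`, which the compiler cannot prove dead, so the argument is treated as escaping. A minimal sketch of the shape that comment describes — the wrapper function and the panic body are illustrative assumptions, not the actual generated code:

```go
package main

import "fmt"

// cgoAlwaysFalse is never set to true, but the compiler cannot prove that.
var cgoAlwaysFalse bool

// cgoUse takes an interface value, so anything passed to it escapes.
//go:noinline
func cgoUse(p interface{}) { panic("cgoUse should never be called") }

// wrapper stands in for a cgo-generated call wrapper (hypothetical shape).
func wrapper(p *int) {
	if cgoAlwaysFalse {
		cgoUse(p) // dead at run time, but forces p to escape to the heap
	}
	fmt.Println(*p) // the real call into C would happen here
}

func main() {
	x := 42
	wrapper(&x)
}
```

As the diff's comment notes, the run-time test is cheaper than an unconditional call, while still giving the desired escape-analysis result.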
diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go index 174320fe85a..7c98f1bc940 100644 --- a/libgo/go/runtime/alg.go +++ b/libgo/go/runtime/alg.go @@ -57,18 +57,15 @@ const ( func memhash0(p unsafe.Pointer, h uintptr) uintptr { return h } + func memhash8(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 1) } + func memhash16(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 2) } -func memhash32(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 4) -} -func memhash64(p unsafe.Pointer, h uintptr) uintptr { - return memhash(p, h, 8) -} + func memhash128(p unsafe.Pointer, h uintptr) uintptr { return memhash(p, h, 16) } diff --git a/libgo/go/runtime/append_test.go b/libgo/go/runtime/append_test.go index 6bd8f3bd951..ef1e812c0dc 100644 --- a/libgo/go/runtime/append_test.go +++ b/libgo/go/runtime/append_test.go @@ -18,42 +18,52 @@ func BenchmarkMakeSlice(b *testing.B) { } } -func BenchmarkGrowSliceBytes(b *testing.B) { - b.StopTimer() - var x = make([]byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]byte(nil), x...) - } -} - -func BenchmarkGrowSliceInts(b *testing.B) { - b.StopTimer() - var x = make([]int, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]int(nil), x...) - } -} - -func BenchmarkGrowSlicePtr(b *testing.B) { - b.StopTimer() - var x = make([]*byte, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]*byte(nil), x...) - } -} +type ( + struct24 struct{ a, b, c int64 } + struct32 struct{ a, b, c, d int64 } + struct40 struct{ a, b, c, d, e int64 } +) -type struct24 struct{ a, b, c int64 } +func BenchmarkGrowSlice(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + x := make([]byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]byte(nil), x...) + } + }) + b.Run("Int", func(b *testing.B) { + x := make([]int, 9) + for i := 0; i < b.N; i++ { + _ = append([]int(nil), x...) + } + }) + b.Run("Ptr", func(b *testing.B) { + x := make([]*byte, 9) + for i := 0; i < b.N; i++ { + _ = append([]*byte(nil), x...) + } + }) + b.Run("Struct", func(b *testing.B) { + b.Run("24", func(b *testing.B) { + x := make([]struct24, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct24(nil), x...) + } + }) + b.Run("32", func(b *testing.B) { + x := make([]struct32, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct32(nil), x...) + } + }) + b.Run("40", func(b *testing.B) { + x := make([]struct40, 9) + for i := 0; i < b.N; i++ { + _ = append([]struct40(nil), x...) + } + }) -func BenchmarkGrowSliceStruct24Bytes(b *testing.B) { - b.StopTimer() - var x = make([]struct24, 9) - b.StartTimer() - for i := 0; i < b.N; i++ { - _ = append([]struct24(nil), x...) - } + }) } func BenchmarkAppend(b *testing.B) { diff --git a/libgo/go/runtime/cgo_gccgo.go b/libgo/go/runtime/cgo_gccgo.go index c3bf9552ea8..05be4964500 100644 --- a/libgo/go/runtime/cgo_gccgo.go +++ b/libgo/go/runtime/cgo_gccgo.go @@ -27,6 +27,13 @@ var iscgo bool // The extra M must be created before any C/C++ code calls cgocallback. var cgoHasExtraM bool +// cgoAlwaysFalse is a boolean value that is always false. +// The cgo-generated code says if cgoAlwaysFalse { cgoUse(p) }. +// The compiler cannot see that cgoAlwaysFalse is always false, +// so it emits the test and keeps the call, giving the desired +// escape analysis result. The test is cheaper than the call. +var cgoAlwaysFalse bool + // Cgocall prepares to call from code written in Go to code written in // C/C++. This takes the current goroutine out of the Go scheduler, as // though it were making a system call. 
Otherwise the program can @@ -37,12 +44,11 @@ var cgoHasExtraM bool // defer syscall.Cgocalldone() // cfunction() func Cgocall() { - lockOSThread() mp := getg().m mp.ncgocall++ mp.ncgo++ - mp.incgo = true entersyscall(0) + mp.incgo = true } // CgocallDone prepares to return to Go code from C/C++ code. @@ -59,8 +65,6 @@ func CgocallDone() { if readgstatus(gp)&^_Gscan == _Gsyscall { exitsyscall(0) } - - unlockOSThread() } // CgocallBack is used when calling from C/C++ code into Go code. @@ -78,6 +82,8 @@ func CgocallBack() { mp.dropextram = true } + lockOSThread() + exitsyscall(0) gp.m.incgo = false @@ -100,6 +106,8 @@ func CgocallBack() { // CgocallBackDone prepares to return to C/C++ code that has called // into Go code. func CgocallBackDone() { + unlockOSThread() + // If we are the top level Go function called from C/C++, then // we need to release the m. But don't release it if we are // panicing; since this is the top level, we are going to diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 4a416fbf6ad..9d161202dfa 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -234,10 +234,8 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { // No more possible pointers. break } - if hbits.isPointer() { - if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { - panic(errorString(msg)) - } + if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { + panic(errorString(msg)) } hbits = hbits.next() } diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go index 30f054b3633..b85b519460e 100644 --- a/libgo/go/runtime/cgocheck.go +++ b/libgo/go/runtime/cgocheck.go @@ -16,6 +16,10 @@ const cgoWriteBarrierFail = "Go pointer stored into non-Go memory" // cgoCheckWriteBarrier is called whenever a pointer is stored into memory. // It throws if the program is storing a Go pointer into non-Go memory. +// +// This is called from the write barrier, so its entire call tree must +// be nosplit. +// //go:nosplit //go:nowritebarrier func cgoCheckWriteBarrier(dst *uintptr, src uintptr) { diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index 7bb919c41db..8db728d5430 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -64,11 +64,19 @@ type waitq struct { } //go:linkname reflect_makechan reflect.makechan -func reflect_makechan(t *chantype, size int64) *hchan { +func reflect_makechan(t *chantype, size int) *hchan { return makechan(t, size) } -func makechan(t *chantype, size int64) *hchan { +func makechan64(t *chantype, size int64) *hchan { + if int64(int(size)) != size { + panic(plainError("makechan: size out of range")) + } + + return makechan(t, int(size)) +} + +func makechan(t *chantype, size int) *hchan { elem := t.elem // compiler checks this but be safe. @@ -78,29 +86,33 @@ func makechan(t *chantype, size int64) *hchan { if hchanSize%maxAlign != 0 || elem.align > maxAlign { throw("makechan: bad alignment") } - if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/elem.size) { + + if size < 0 || uintptr(size) > maxSliceCap(elem.size) || uintptr(size)*elem.size > _MaxMem-hchanSize { panic(plainError("makechan: size out of range")) } + // Hchan does not contain pointers interesting for GC when elements stored in buf do not contain pointers. + // buf points into the same allocation, elemtype is persistent. + // SudoG's are referenced from their owning thread so they can't be collected. 
+ // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. var c *hchan - if elem.kind&kindNoPointers != 0 || size == 0 { - // Allocate memory in one call. - // Hchan does not contain pointers interesting for GC in this case: - // buf points into the same allocation, elemtype is persistent. - // SudoG's are referenced from their owning thread so they can't be collected. - // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. + switch { + case size == 0 || elem.size == 0: + // Queue or element size is zero. + c = (*hchan)(mallocgc(hchanSize, nil, true)) + // Race detector uses this location for synchronization. + c.buf = unsafe.Pointer(c) + case elem.kind&kindNoPointers != 0: + // Elements do not contain pointers. + // Allocate hchan and buf in one call. c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true)) - if size > 0 && elem.size != 0 { - c.buf = add(unsafe.Pointer(c), hchanSize) - } else { - // race detector uses this location for synchronization - // Also prevents us from pointing beyond the allocation (see issue 9401). - c.buf = unsafe.Pointer(c) - } - } else { + c.buf = add(unsafe.Pointer(c), hchanSize) + default: + // Elements contain pointers. c = new(hchan) - c.buf = newarray(elem, int(size)) + c.buf = mallocgc(uintptr(size)*elem.size, elem, true) } + c.elemsize = uint16(elem.size) c.elemtype = elem c.dataqsiz = uint(size) @@ -119,7 +131,7 @@ func chanbuf(c *hchan, i uint) unsafe.Pointer { // entry point for c <- x from compiled code //go:nosplit func chansend1(c *hchan, elem unsafe.Pointer) { - chansend(c, elem, true, getcallerpc(unsafe.Pointer(&c))) + chansend(c, elem, true, getcallerpc()) } /* @@ -223,7 +235,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { mysg.elem = ep mysg.waitlink = nil mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.waiting = mysg gp.param = nil @@ -331,7 +343,7 @@ func closechan(c *hchan) { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&c)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(c), callerpc, funcPC(closechan)) racerelease(unsafe.Pointer(c)) } @@ -508,7 +520,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) mysg.waitlink = nil gp.waiting = mysg mysg.g = gp - mysg.selectdone = nil + mysg.isSelect = false mysg.c = c gp.param = nil c.recvq.enqueue(mysg) @@ -603,7 +615,7 @@ func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { // } // func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) { - return chansend(c, elem, false, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, false, getcallerpc()) } // compiler implements @@ -653,7 +665,7 @@ func selectnbrecv2(elem unsafe.Pointer, received *bool, c *hchan) (selected bool //go:linkname reflect_chansend reflect.chansend func reflect_chansend(c *hchan, elem unsafe.Pointer, nb bool) (selected bool) { - return chansend(c, elem, !nb, getcallerpc(unsafe.Pointer(&c))) + return chansend(c, elem, !nb, getcallerpc()) } //go:linkname reflect_chanrecv reflect.chanrecv @@ -712,10 +724,16 @@ func (q *waitq) dequeue() *sudog { sgp.next = nil // mark as removed (see dequeueSudog) } - // if sgp participates in a select and is already signaled, ignore it - if sgp.selectdone != nil { - // claim the right to signal - if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) { + // if a goroutine was put on this queue because of a + // select, there is a small window between the goroutine + // being woken up by 
a different case and it grabbing the + // channel locks. Once it has the lock + // it removes itself from the queue, so we won't see it after that. + // We use a flag in the G struct to tell us when someone + // else has won the race to signal this goroutine but the goroutine + // hasn't removed itself from the queue yet. + if sgp.isSelect { + if !atomic.Cas(&sgp.g.selectDone, 0, 1) { continue } } diff --git a/libgo/go/runtime/chan_test.go b/libgo/go/runtime/chan_test.go index b96af8af5d7..29fb321c926 100644 --- a/libgo/go/runtime/chan_test.go +++ b/libgo/go/runtime/chan_test.go @@ -5,6 +5,8 @@ package runtime_test import ( + "internal/testenv" + "math" "runtime" "sync" "sync/atomic" @@ -435,6 +437,65 @@ func TestSelectStress(t *testing.T) { wg.Wait() } +func TestSelectFairness(t *testing.T) { + const trials = 10000 + if runtime.GOOS == "linux" && runtime.GOARCH == "ppc64le" { + testenv.SkipFlaky(t, 22047) + } + c1 := make(chan byte, trials+1) + c2 := make(chan byte, trials+1) + for i := 0; i < trials+1; i++ { + c1 <- 1 + c2 <- 2 + } + c3 := make(chan byte) + c4 := make(chan byte) + out := make(chan byte) + done := make(chan byte) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for { + var b byte + select { + case b = <-c3: + case b = <-c4: + case b = <-c1: + case b = <-c2: + } + select { + case out <- b: + case <-done: + return + } + } + }() + cnt1, cnt2 := 0, 0 + for i := 0; i < trials; i++ { + switch b := <-out; b { + case 1: + cnt1++ + case 2: + cnt2++ + default: + t.Fatalf("unexpected value %d on channel", b) + } + } + // If the select in the goroutine is fair, + // cnt1 and cnt2 should be about the same value. + // With 10,000 trials, the expected margin of error at + // a confidence level of five nines is 4.4172 / (2 * Sqrt(10000)). 
+ r := float64(cnt1) / trials + e := math.Abs(r - 0.5) + t.Log(cnt1, cnt2, r, e) + if e > 4.4172/(2*math.Sqrt(trials)) { + t.Errorf("unfair select: in %d trials, results were %d, %d", trials, cnt1, cnt2) + } + close(done) + wg.Wait() +} + func TestChanSendInterface(t *testing.T) { type mt struct{} m := &mt{} @@ -674,6 +735,55 @@ done: <-ready2 } +type struct0 struct{} + +func BenchmarkMakeChan(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x chan byte + for i := 0; i < b.N; i++ { + x = make(chan byte, 8) + } + close(x) + }) + b.Run("Int", func(b *testing.B) { + var x chan int + for i := 0; i < b.N; i++ { + x = make(chan int, 8) + } + close(x) + }) + b.Run("Ptr", func(b *testing.B) { + var x chan *byte + for i := 0; i < b.N; i++ { + x = make(chan *byte, 8) + } + close(x) + }) + b.Run("Struct", func(b *testing.B) { + b.Run("0", func(b *testing.B) { + var x chan struct0 + for i := 0; i < b.N; i++ { + x = make(chan struct0, 8) + } + close(x) + }) + b.Run("32", func(b *testing.B) { + var x chan struct32 + for i := 0; i < b.N; i++ { + x = make(chan struct32, 8) + } + close(x) + }) + b.Run("40", func(b *testing.B) { + var x chan struct40 + for i := 0; i < b.N; i++ { + x = make(chan struct40, 8) + } + close(x) + }) + }) +} + func BenchmarkChanNonblocking(b *testing.B) { myc := make(chan int) b.RunParallel(func(pb *testing.PB) { diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go index b031b1a5e75..91cdf2b5594 100644 --- a/libgo/go/runtime/cpuprof.go +++ b/libgo/go/runtime/cpuprof.go @@ -160,6 +160,7 @@ func (p *cpuProfile) addExtra() { funcPC(_ExternalCode) + sys.PCQuantum, } cpuprof.log.write(nil, 0, hdr[:], lostStk[:]) + p.lostExtra = 0 } } diff --git a/libgo/go/runtime/cputicks.go b/libgo/go/runtime/cputicks.go index ee15aca24ef..7e62dc1e108 100644 --- a/libgo/go/runtime/cputicks.go +++ b/libgo/go/runtime/cputicks.go @@ -4,6 +4,6 @@ package runtime -// careful: cputicks is not guaranteed to be monotonic! In particular, we have +// careful: cputicks is not guaranteed to be monotonic! In particular, we have // noticed drift between cpus on certain os/arch combinations. See issue 8976. 
func cputicks() int64 diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index b79873185cc..7e14e573bc5 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -13,6 +13,7 @@ import ( "os" "os/exec" "runtime" + "strconv" "strings" "testing" "time" @@ -113,7 +114,7 @@ func TestCgoExternalThreadSIGPROF(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -136,7 +137,7 @@ func TestCgoExternalThreadSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoExternalThreadSIGPROF")).CombinedOutput() if err != nil { t.Fatalf("exit status: %v\n%s", err, got) } @@ -203,14 +204,14 @@ func TestCgoCheckBytes(t *testing.T) { const tries = 10 var tot1, tot2 time.Duration for i := 0; i < tries; i++ { - cmd := testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, "GODEBUG=cgocheck=0", fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start := time.Now() cmd.Run() d1 := time.Since(start) - cmd = testEnv(exec.Command(exe, "CgoCheckBytes")) + cmd = testenv.CleanCmdEnv(exec.Command(exe, "CgoCheckBytes")) cmd.Env = append(cmd.Env, fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i)) start = time.Now() @@ -251,7 +252,7 @@ func TestCgoCCodeSIGPROF(t *testing.T) { func TestCgoCrashTraceback(t *testing.T) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } if runtime.Compiler == "gccgo" { @@ -279,7 +280,7 @@ func TestCgoTracebackContext(t *testing.T) { func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Parallel() - if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" { + if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) } if runtime.Compiler == "gccgo" { @@ -292,7 +293,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, runArg)).CombinedOutput() if err != nil { if testenv.Builder() == "linux-amd64-alpine" { // See Issue 18243 and Issue 19938. @@ -304,7 +305,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -339,7 +340,7 @@ func TestCgoPprof(t *testing.T) { } func TestCgoPprofPIE(t *testing.T) { - testCgoPprof(t, "-ldflags=-extldflags=-pie", "CgoPprof") + testCgoPprof(t, "-buildmode=pie", "CgoPprof") } func TestCgoPprofThread(t *testing.T) { @@ -371,7 +372,7 @@ func TestRaceProf(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceprof")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -400,7 +401,7 @@ func TestRaceSignal(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "CgoRaceSignal")).CombinedOutput() if err != nil { t.Logf("%s\n", got) t.Fatal(err) @@ -423,3 +424,72 @@ func TestCgoNumGoroutine(t *testing.T) { t.Errorf("expected %q got %v", want, got) } } + +func TestCatchPanic(t *testing.T) { + t.Parallel() + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + case "darwin": + if runtime.GOARCH == "amd64" { + t.Skipf("crash() on darwin/amd64 doesn't raise SIGABRT") + } + } + + testenv.MustHaveGoRun(t) + + exe, err := buildTestProg(t, "testprogcgo") + if err != nil { + t.Fatal(err) + } + + for _, early := range []bool{true, false} { + cmd := testenv.CleanCmdEnv(exec.Command(exe, "CgoCatchPanic")) + // Make sure a panic results in a crash. + cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") + if early { + // Tell testprogcgo to install an early signal handler for SIGABRT + cmd.Env = append(cmd.Env, "CGOCATCHPANIC_EARLY_HANDLER=1") + } + if out, err := cmd.CombinedOutput(); err != nil { + t.Errorf("testprogcgo CgoCatchPanic failed: %v\n%s", err, out) + } + } +} + +func TestCgoLockOSThreadExit(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no pthreads on %s", runtime.GOOS) + } + t.Parallel() + testLockOSThreadExit(t, "testprogcgo") +} + +func TestWindowsStackMemoryCgo(t *testing.T) { + if runtime.GOOS != "windows" { + t.Skip("skipping windows specific test") + } + testenv.SkipFlaky(t, 22575) + o := runTestProg(t, "testprogcgo", "StackMemory") + stackUsage, err := strconv.Atoi(o) + if err != nil { + t.Fatalf("Failed to read stack usage: %v", err) + } + if expected, got := 100<<10, stackUsage; got > expected { + t.Fatalf("expected < %d bytes of memory per thread, got %d", expected, got) + } +} + +func TestSigStackSwapping(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skip("no sigaltstack on %s", runtime.GOOS) + } + t.Parallel() + got := runTestProg(t, "testprogcgo", "SigStack") + want := "OK\n" + if got != want { + t.Errorf("expected %q got %v", want, got) + } +} diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 1cde6bf7997..8ec034835ec 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -32,25 +32,6 @@ func TestMain(m *testing.M) { os.Exit(status) } -func testEnv(cmd *exec.Cmd) *exec.Cmd { - if cmd.Env != nil { - panic("environment already set") - } - for _, env := range os.Environ() { - // Exclude GODEBUG from the environment to prevent its output - // from breaking tests that are trying to parse other command output. - if strings.HasPrefix(env, "GODEBUG=") { - continue - } - // Exclude GOTRACEBACK for the same reason. 
- if strings.HasPrefix(env, "GOTRACEBACK=") { - continue - } - cmd.Env = append(cmd.Env, env) - } - return cmd -} - var testprog struct { sync.Mutex dir string @@ -62,7 +43,11 @@ type buildexe struct { err error } -func runTestProg(t *testing.T, binary, name string) string { +func runTestProg(t *testing.T, binary, name string, env ...string) string { + if *flagQuick { + t.Skip("-quick") + } + testenv.MustHaveGoBuild(t) exe, err := buildTestProg(t, binary) @@ -70,7 +55,11 @@ func runTestProg(t *testing.T, binary, name string) string { t.Fatal(err) } - cmd := testEnv(exec.Command(exe, name)) + cmd := testenv.CleanCmdEnv(exec.Command(exe, name)) + cmd.Env = append(cmd.Env, env...) + if testing.Short() { + cmd.Env = append(cmd.Env, "RUNTIME_TEST_SHORT=1") + } var b bytes.Buffer cmd.Stdout = &b cmd.Stderr = &b @@ -111,6 +100,10 @@ func runTestProg(t *testing.T, binary, name string) string { } func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) { + if *flagQuick { + t.Skip("-quick") + } + checkStaleRuntime(t) testprog.Lock() @@ -139,7 +132,7 @@ func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) exe := filepath.Join(testprog.dir, name+".exe") cmd := exec.Command(testenv.GoToolPath(t), append([]string{"build", "-o", exe}, flags...)...) cmd.Dir = "testdata/" + binary - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { target.err = fmt.Errorf("building %s %v: %v\n%s", binary, flags, err, out) testprog.target[name] = target @@ -158,14 +151,14 @@ var ( func checkStaleRuntime(t *testing.T) { staleRuntimeOnce.Do(func() { // 'go run' uses the installed copy of runtime.a, which may be out of date. - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.Stale}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.Stale}}", "runtime")).CombinedOutput() if err != nil { staleRuntimeErr = fmt.Errorf("failed to execute 'go list': %v\n%v", err, string(out)) return } if string(out) != "false\n" { t.Logf("go list -f {{.Stale}} runtime:\n%s", out) - out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() + out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.StaleReason}}", "runtime")).CombinedOutput() if err != nil { t.Logf("go list -f {{.StaleReason}} failed: %v", err) } @@ -483,7 +476,7 @@ func TestMemPprof(t *testing.T) { t.Fatal(err) } - got, err := testEnv(exec.Command(exe, "MemProf")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "MemProf")).CombinedOutput() if err != nil { t.Fatal(err) } @@ -491,7 +484,7 @@ func TestMemPprof(t *testing.T) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top")) // Check that pprof works both with and without explicit executable on command line. 
if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -606,7 +599,7 @@ func TestPanicRace(t *testing.T) { const tries = 10 retry: for i := 0; i < tries; i++ { - got, err := testEnv(exec.Command(exe, "PanicRace")).CombinedOutput() + got, err := testenv.CleanCmdEnv(exec.Command(exe, "PanicRace")).CombinedOutput() if err == nil { t.Logf("try %d: program exited successfully, should have failed", i+1) continue diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go index 09c25471d10..584a6c74232 100644 --- a/libgo/go/runtime/crash_unix_test.go +++ b/libgo/go/runtime/crash_unix_test.go @@ -65,13 +65,13 @@ func TestCrashDumpsAllThreads(t *testing.T) { cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") cmd.Dir = dir - out, err := testEnv(cmd).CombinedOutput() + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() if err != nil { t.Fatalf("building source: %v\n%s", err, out) } cmd = exec.Command(filepath.Join(dir, "a.exe")) - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") // Set GOGC=off. Because of golang.org/issue/10958, the tight @@ -132,6 +132,7 @@ import ( "fmt" "os" "runtime" + "time" ) func main() { @@ -149,6 +150,8 @@ func main() { <-c } + time.Sleep(time.Millisecond) + // Tell our parent that all the goroutines are executing. if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil { fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err) @@ -184,7 +187,7 @@ func TestPanicSystemstack(t *testing.T) { t.Parallel() cmd := exec.Command(os.Args[0], "testPanicSystemstackInternal") - cmd = testEnv(cmd) + cmd = testenv.CleanCmdEnv(cmd) cmd.Env = append(cmd.Env, "GOTRACEBACK=crash") pr, pw, err := os.Pipe() if err != nil { @@ -249,7 +252,7 @@ func TestSignalExitStatus(t *testing.T) { if err != nil { t.Fatal(err) } - err = testEnv(exec.Command(exe, "SignalExitStatus")).Run() + err = testenv.CleanCmdEnv(exec.Command(exe, "SignalExitStatus")).Run() if err == nil { t.Error("test program succeeded unexpectedly") } else if ee, ok := err.(*exec.ExitError); !ok { diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go index fdd73463aba..7cddd29ed0f 100644 --- a/libgo/go/runtime/debug.go +++ b/libgo/go/runtime/debug.go @@ -15,9 +15,6 @@ import ( // The number of logical CPUs on the local machine can be queried with NumCPU. // This call will go away when the scheduler improves. 
func GOMAXPROCS(n int) int { - if n > _MaxGomaxprocs { - n = _MaxGomaxprocs - } lock(&sched.lock) ret := int(gomaxprocs) unlock(&sched.lock) diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index 6325dcb3948..e385f14c5bc 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -149,12 +149,19 @@ func RunSchedLocalQueueEmptyTest(iters int) { } } -var StringHash = stringHash -var BytesHash = bytesHash -var Int32Hash = int32Hash -var Int64Hash = int64Hash -var EfaceHash = efaceHash -var IfaceHash = ifaceHash +var ( + StringHash = stringHash + BytesHash = bytesHash + Int32Hash = int32Hash + Int64Hash = int64Hash + MemHash = memhash + MemHash32 = memhash32 + MemHash64 = memhash64 + EfaceHash = efaceHash + IfaceHash = ifaceHash +) + +var UseAeshash = &useAeshash func MemclrBytes(b []byte) { s := (*slice)(unsafe.Pointer(&b)) @@ -364,3 +371,27 @@ func (rw *RWMutex) Lock() { func (rw *RWMutex) Unlock() { rw.rw.unlock() } + +func MapBucketsCount(m map[int]int) int { + h := *(**hmap)(unsafe.Pointer(&m)) + return 1 << h.B +} + +func MapBucketsPointerIsNil(m map[int]int) bool { + h := *(**hmap)(unsafe.Pointer(&m)) + return h.buckets == nil +} + +func LockOSCounts() (external, internal uint32) { + g := getg() + if g.m.lockedExt+g.m.lockedInt == 0 { + if g.lockedm != 0 { + panic("lockedm on non-locked goroutine") + } + } else { + if g.lockedm == 0 { + panic("nil lockedm on locked goroutine") + } + } + return g.m.lockedExt, g.m.lockedInt +} diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go index 6ca978980f2..36787e38b02 100644 --- a/libgo/go/runtime/extern.go +++ b/libgo/go/runtime/extern.go @@ -184,8 +184,8 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool) // program counter adjustment. func Callers(skip int, pc []uintptr) int -// GOROOT returns the root of the Go tree. -// It uses the GOROOT environment variable, if set, +// GOROOT returns the root of the Go tree. It uses the +// GOROOT environment variable, if set at process start, // or else the root used during the Go build. func GOROOT() string { s := gogetenv("GOROOT") diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go index f14e0d5050e..a8c52d206f3 100644 --- a/libgo/go/runtime/gc_test.go +++ b/libgo/go/runtime/gc_test.go @@ -10,12 +10,14 @@ import ( "reflect" "runtime" "runtime/debug" + "sync/atomic" "testing" "time" "unsafe" ) func TestGcSys(t *testing.T) { + t.Skip("does not test anything; https://golang.org/issue/23343") if os.Getenv("GOGC") == "off" { t.Skip("skipping test; GOGC=off in environment") } @@ -171,7 +173,7 @@ func TestPeriodicGC(t *testing.T) { // slack if things are slow. var numGCs uint32 const want = 2 - for i := 0; i < 20 && numGCs < want; i++ { + for i := 0; i < 200 && numGCs < want; i++ { time.Sleep(5 * time.Millisecond) // Test that periodic GC actually happened. @@ -501,3 +503,142 @@ func BenchmarkReadMemStats(b *testing.B) { hugeSink = nil } + +func TestUserForcedGC(t *testing.T) { + // Test that runtime.GC() triggers a GC even if GOGC=off. 
+ defer debug.SetGCPercent(debug.SetGCPercent(-1)) + + var ms1, ms2 runtime.MemStats + runtime.ReadMemStats(&ms1) + runtime.GC() + runtime.ReadMemStats(&ms2) + if ms1.NumGC == ms2.NumGC { + t.Fatalf("runtime.GC() did not trigger GC") + } + if ms1.NumForcedGC == ms2.NumForcedGC { + t.Fatalf("runtime.GC() was not accounted in NumForcedGC") + } +} + +func writeBarrierBenchmark(b *testing.B, f func()) { + runtime.GC() + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + //b.Logf("heap size: %d MB", ms.HeapAlloc>>20) + + // Keep GC running continuously during the benchmark, which in + // turn keeps the write barrier on continuously. + var stop uint32 + done := make(chan bool) + go func() { + for atomic.LoadUint32(&stop) == 0 { + runtime.GC() + } + close(done) + }() + defer func() { + atomic.StoreUint32(&stop, 1) + <-done + }() + + b.ResetTimer() + f() + b.StopTimer() +} + +func BenchmarkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large tree both so the GC runs for a while and + // so we have a data structure to manipulate the pointers of. + type node struct { + l, r *node + } + var wbRoots []*node + var mkTree func(level int) *node + mkTree = func(level int) *node { + if level == 0 { + return nil + } + n := &node{mkTree(level - 1), mkTree(level - 1)} + if level == 10 { + // Seed GC with enough early pointers so it + // doesn't accidentally switch to mark 2 when + // it only has the top of the tree. + wbRoots = append(wbRoots, n) + } + return n + } + const depth = 22 // 64 MB + root := mkTree(22) + + writeBarrierBenchmark(b, func() { + var stack [depth]*node + tos := -1 + + // There are two write barriers per iteration, so i+=2. + for i := 0; i < b.N; i += 2 { + if tos == -1 { + stack[0] = root + tos = 0 + } + + // Perform one step of reversing the tree. + n := stack[tos] + if n.l == nil { + tos-- + } else { + n.l, n.r = n.r, n.l + stack[tos] = n.l + stack[tos+1] = n.r + tos++ + } + + if i%(1<<12) == 0 { + // Avoid non-preemptible loops (see issue #10958). + runtime.Gosched() + } + } + }) + + runtime.KeepAlive(wbRoots) +} + +func BenchmarkBulkWriteBarrier(b *testing.B) { + if runtime.GOMAXPROCS(-1) < 2 { + // We don't want GC to take our time. + b.Skip("need GOMAXPROCS >= 2") + } + + // Construct a large set of objects we can copy around. + const heapSize = 64 << 20 + type obj [16]*byte + ptrs := make([]*obj, heapSize/unsafe.Sizeof(obj{})) + for i := range ptrs { + ptrs[i] = new(obj) + } + + writeBarrierBenchmark(b, func() { + const blockSize = 1024 + var pos int + for i := 0; i < b.N; i += blockSize { + // Rotate block. 
+ block := ptrs[pos : pos+blockSize] + first := block[0] + copy(block, block[1:]) + block[blockSize-1] = first + + pos += blockSize + if pos+blockSize > len(ptrs) { + pos = 0 + } + + runtime.Gosched() + } + }) + + runtime.KeepAlive(ptrs) +} diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go index dd2e657fe3f..401fe2857d9 100644 --- a/libgo/go/runtime/hash32.go +++ b/libgo/go/runtime/hash32.go @@ -86,6 +86,32 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 4*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint32(seed + 8*hashkey[0]) + h ^= readUnaligned32(p) + h = rotl_15(h*m1) * m2 + h ^= readUnaligned32(add(p, 4)) + h = rotl_15(h*m1) * m2 + h ^= h >> 17 + h *= m3 + h ^= h >> 13 + h *= m4 + h ^= h >> 16 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go index f7d4a6f2f2a..5912943a4e9 100644 --- a/libgo/go/runtime/hash64.go +++ b/libgo/go/runtime/hash64.go @@ -86,6 +86,28 @@ tail: return uintptr(h) } +func memhash32(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 4*hashkey[0]) + v := uint64(readUnaligned32(p)) + h ^= v + h ^= v << 32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + +func memhash64(p unsafe.Pointer, seed uintptr) uintptr { + h := uint64(seed + 8*hashkey[0]) + h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32 + h = rotl_31(h*m1) * m2 + h ^= h >> 29 + h *= m3 + h ^= h >> 32 + return uintptr(h) +} + // Note: in order to get the compiler to issue rotl instructions, we // need to constant fold the shift amount by hand. // TODO: convince the compiler to issue rotl instructions after inlining. diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go index 167c49eb5f5..54c91609f60 100644 --- a/libgo/go/runtime/hash_test.go +++ b/libgo/go/runtime/hash_test.go @@ -14,6 +14,40 @@ import ( "unsafe" ) +func TestMemHash32Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [4]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash32(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 4) + if got != want { + t.Errorf("MemHash32(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + +func TestMemHash64Equality(t *testing.T) { + if *UseAeshash { + t.Skip("skipping since AES hash implementation is used") + } + var b [8]byte + r := rand.New(rand.NewSource(1234)) + seed := uintptr(r.Uint64()) + for i := 0; i < 100; i++ { + randBytes(r, b[:]) + got := MemHash64(unsafe.Pointer(&b), seed) + want := MemHash(unsafe.Pointer(&b), seed, 8) + if got != want { + t.Errorf("MemHash64(%x, %v) = %v; want %v", b, seed, got, want) + } + } +} + // Smhasher is a torture test for hash functions. // https://code.google.com/p/smhasher/ // This code is a port of some of the Smhasher tests to Go. 
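The hash_test.go hunk above checks that the new fixed-width `memhash32`/`memhash64` agree bit-for-bit with the general `memhash`. A self-contained sketch of that invariant, using a toy FNV-1a hash as a stand-in for the runtime's hashers:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// hashBytes is a toy general-purpose hash (FNV-1a), standing in for memhash.
func hashBytes(b []byte, seed uint64) uint64 {
	h := seed ^ 14695981039346656037
	for _, c := range b {
		h ^= uint64(c)
		h *= 1099511628211
	}
	return h
}

// hash4 is the width-specialized 4-byte path, standing in for memhash32.
// It hashes the little-endian bytes of v directly, without a byte slice.
func hash4(v uint32, seed uint64) uint64 {
	h := seed ^ 14695981039346656037
	for i := uint(0); i < 4; i++ {
		h ^= uint64((v >> (8 * i)) & 0xff)
		h *= 1099511628211
	}
	return h
}

func main() {
	for v := uint32(0); v < 1000; v++ {
		var b [4]byte
		binary.LittleEndian.PutUint32(b[:], v)
		if hash4(v, 1) != hashBytes(b[:], 1) {
			panic("specialized hash disagrees with general hash")
		}
	}
	fmt.Println("hash4 agrees with hashBytes on all sampled inputs")
}
```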
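Earlier in this patch, the chan_test.go hunk's TestSelectFairness bounds the observed deviation by `4.4172 / (2 * Sqrt(10000))`. The standard error of a fair binomial proportion is `sqrt(0.25/n) = 1/(2*sqrt(n))`, and 4.4172 is (approximately) the two-sided normal quantile for a 1e-5 tail. A few lines reproducing the bound — the quantile value is taken from the test, not recomputed:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	const trials = 10000
	const z = 4.4172 // normal quantile used by the test for a two-sided 1e-5 tail
	// Standard error of a fair proportion: sqrt(0.25/n) = 1/(2*sqrt(n)).
	bound := z / (2 * math.Sqrt(trials))
	fmt.Printf("allowed |cnt1/trials - 0.5| <= %.4f\n", bound) // ~0.0221
}
```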
diff --git a/libgo/go/runtime/hashmap.go b/libgo/go/runtime/hashmap.go index a3e50cd9221..a1fe49e9305 100644 --- a/libgo/go/runtime/hashmap.go +++ b/libgo/go/runtime/hashmap.go @@ -63,6 +63,8 @@ import ( // themselves, so that the compiler will export them. // //go:linkname makemap runtime.makemap +//go:linkname makemap64 runtime.makemap64 +//go:linkname makemap_small runtime.makemap_small //go:linkname mapaccess1 runtime.mapaccess1 //go:linkname mapaccess2 runtime.mapaccess2 //go:linkname mapaccess1_fat runtime.mapaccess1_fat @@ -77,8 +79,10 @@ const ( bucketCntBits = 3 bucketCnt = 1 << bucketCntBits - // Maximum average load of a bucket that triggers growth. - loadFactor = 6.5 + // Maximum average load of a bucket that triggers growth is 6.5. + // Represent as loadFactorNum/loadFactDen, to allow integer math. + loadFactorNum = 13 + loadFactorDen = 2 // Maximum key or value size to keep inline (instead of mallocing per element). // Must fit in a uint8. @@ -137,12 +141,13 @@ type mapextra struct { // If both key and value do not contain pointers and are inline, then we mark bucket // type as containing no pointers. This avoids scanning such maps. // However, bmap.overflow is a pointer. In order to keep overflow buckets - // alive, we store pointers to all overflow buckets in hmap.overflow. - // Overflow is used only if key and value do not contain pointers. - // overflow[0] contains overflow buckets for hmap.buckets. - // overflow[1] contains overflow buckets for hmap.oldbuckets. + // alive, we store pointers to all overflow buckets in hmap.overflow and h.map.oldoverflow. + // overflow and oldoverflow are only used if key and value do not contain pointers. + // overflow contains overflow buckets for hmap.buckets. + // oldoverflow contains overflow buckets for hmap.oldbuckets. // The indirection allows to store a pointer to the slice in hiter. - overflow [2]*[]*bmap + overflow *[]*bmap + oldoverflow *[]*bmap // nextOverflow holds a pointer to a free overflow bucket. nextOverflow *bmap @@ -171,7 +176,8 @@ type hiter struct { h *hmap buckets unsafe.Pointer // bucket ptr at hash_iter initialization time bptr *bmap // current bucket - overflow [2]*[]*bmap // keeps overflow buckets alive + overflow *[]*bmap // keeps overflow buckets of hmap.buckets alive + oldoverflow *[]*bmap // keeps overflow buckets of hmap.oldbuckets alive startBucket uintptr // bucket iteration started at offset uint8 // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1) wrapped bool // already wrapped around from end of bucket array to beginning @@ -181,6 +187,28 @@ type hiter struct { checkBucket uintptr } +// bucketShift returns 1<<b, optimized for code generation. +func bucketShift(b uint8) uintptr { + if sys.GoarchAmd64|sys.GoarchAmd64p32|sys.Goarch386 != 0 { + b &= sys.PtrSize*8 - 1 // help x86 archs remove shift overflow checks + } + return uintptr(1) << b +} + +// bucketMask returns 1<<b - 1, optimized for code generation. +func bucketMask(b uint8) uintptr { + return bucketShift(b) - 1 +} + +// tophash calculates the tophash value for hash. 
+func tophash(hash uintptr) uint8 { + top := uint8(hash >> (sys.PtrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + return top +} + func evacuated(b *bmap) bool { h := b.tophash[0] return h > empty && h < minTopHash @@ -194,6 +222,10 @@ func (b *bmap) setoverflow(t *maptype, ovf *bmap) { *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf } +func (b *bmap) keys() unsafe.Pointer { + return add(unsafe.Pointer(b), dataOffset) +} + // incrnoverflow increments h.noverflow. // noverflow counts the number of overflow buckets. // This is used to trigger same-size map growth. @@ -242,7 +274,7 @@ func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap { h.incrnoverflow() if t.bucket.kind&kindNoPointers != 0 { h.createOverflow() - *h.extra.overflow[0] = append(*h.extra.overflow[0], ovf) + *h.extra.overflow = append(*h.extra.overflow, ovf) } b.setoverflow(t, ovf) return ovf @@ -252,97 +284,69 @@ func (h *hmap) createOverflow() { if h.extra == nil { h.extra = new(mapextra) } - if h.extra.overflow[0] == nil { - h.extra.overflow[0] = new([]*bmap) + if h.extra.overflow == nil { + h.extra.overflow = new([]*bmap) } } -// makemap implements a Go map creation make(map[k]v, hint) +func makemap64(t *maptype, hint int64, h *hmap) *hmap { + if int64(int(hint)) != hint { + hint = 0 + } + return makemap(t, int(hint), h) +} + +// makehmap_small implements Go map creation for make(map[k]v) and +// make(map[k]v, hint) when hint is known to be at most bucketCnt +// at compile time and the map needs to be allocated on the heap. +func makemap_small() *hmap { + h := new(hmap) + h.hash0 = fastrand() + return h +} + +// makemap implements Go map creation for make(map[k]v, hint). // If the compiler has determined that the map or the first bucket // can be created on the stack, h and/or bucket may be non-nil. // If h != nil, the map can be created directly in h. -// If bucket != nil, bucket can be used as the first bucket. -func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { - if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != t.hmap.size { +// If h.buckets != nil, bucket pointed to can be used as the first bucket. +func makemap(t *maptype, hint int, h *hmap) *hmap { + // The size of hmap should be 48 bytes on 64 bit + // and 28 bytes on 32 bit platforms. + if sz := unsafe.Sizeof(hmap{}); sz != 8+5*sys.PtrSize { println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) throw("bad hmap size") } - if hint < 0 || hint > int64(maxSliceCap(t.bucket.size)) { + if hint < 0 || hint > int(maxSliceCap(t.bucket.size)) { hint = 0 } - if !ismapkey(t.key) { - throw("runtime.makemap: unsupported map key type") - } - - // check compiler's and reflect's math - if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || - t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { - throw("key size wrong") - } - if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || - t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { - throw("value size wrong") - } - - // invariants we depend on. We should probably check these at compile time - // somewhere, but for now we'll do it here. 
- if t.key.align > bucketCnt { - throw("key align too big") - } - if t.elem.align > bucketCnt { - throw("value align too big") - } - if t.key.size%uintptr(t.key.align) != 0 { - throw("key size not a multiple of key align") - } - if t.elem.size%uintptr(t.elem.align) != 0 { - throw("value size not a multiple of value align") - } - if bucketCnt < 8 { - throw("bucketsize too small for proper alignment") - } - if dataOffset%uintptr(t.key.align) != 0 { - throw("need padding in bucket (key)") - } - if dataOffset%uintptr(t.elem.align) != 0 { - throw("need padding in bucket (value)") + // initialize Hmap + if h == nil { + h = (*hmap)(newobject(t.hmap)) } + h.hash0 = fastrand() // find size parameter which will hold the requested # of elements B := uint8(0) - for ; overLoadFactor(hint, B); B++ { + for overLoadFactor(hint, B) { + B++ } + h.B = B // allocate initial hash table // if B == 0, the buckets field is allocated lazily later (in mapassign) // If hint is large zeroing this memory could take a while. - buckets := bucket - var extra *mapextra - if B != 0 { + if h.B != 0 { var nextOverflow *bmap - buckets, nextOverflow = makeBucketArray(t, B) + h.buckets, nextOverflow = makeBucketArray(t, h.B) if nextOverflow != nil { - extra = new(mapextra) - extra.nextOverflow = nextOverflow + h.extra = new(mapextra) + h.extra.nextOverflow = nextOverflow } } - // initialize Hmap - if h == nil { - h = (*hmap)(newobject(t.hmap)) - } - h.count = 0 - h.B = B - h.extra = extra - h.flags = 0 - h.hash0 = fastrand() - h.buckets = buckets - h.oldbuckets = nil - h.nevacuate = 0 - h.noverflow = 0 - return h } @@ -353,7 +357,7 @@ func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap { // hold onto it for very long. func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapaccess1) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -370,7 +374,7 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -382,11 +386,8 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -403,16 +404,13 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { return v } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapaccess2) racereadpc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -429,7 +427,7 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + 
(hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -441,11 +439,8 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -462,11 +457,8 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) return v, true } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false - } } + return unsafe.Pointer(&zeroVal[0]), false } // returns both key and value. Used by map iterator @@ -477,7 +469,7 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe hashfn := t.key.hashfn equalfn := t.key.equalfn hash := hashfn(key, uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -489,11 +481,8 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -510,11 +499,8 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe return k, v } } - b = b.overflow(t) - if b == nil { - return nil, nil - } } + return nil, nil } func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer { @@ -539,7 +525,7 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapassign) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -559,19 +545,16 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { growWork(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + top := tophash(hash) var inserti *uint8 var insertk unsafe.Pointer @@ -611,7 +594,7 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. 
- if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } @@ -651,7 +634,7 @@ done: func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() pc := funcPC(mapdelete) racewritepc(unsafe.Pointer(h), callerpc, pc) raceReadObjectPC(t.key, key, callerpc, pc) @@ -674,16 +657,14 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { // in which case we have not actually done a write (delete). h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { growWork(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { continue @@ -696,53 +677,58 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { if !equalfn(key, k2) { continue } + // Only clear key if there are pointers in it. if t.indirectkey { *(*unsafe.Pointer)(k) = nil - } else { - typedmemclr(t.key, k) + } else if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) } - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*uintptr(t.keysize) + i*uintptr(t.valuesize)) - if t.indirectvalue { - *(*unsafe.Pointer)(v) = nil - } else { - typedmemclr(t.elem, v) + // Only clear value if there are pointers in it. + if t.indirectvalue || t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + *(*unsafe.Pointer)(v) = nil + } else { + memclrHasPointers(v, t.elem.size) + } } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } +// mapiterinit initializes the hiter struct used for ranging over maps. +// The hiter struct pointed to by 'it' is allocated on the stack +// by the compilers order pass or on the heap by reflect_mapiterinit. +// Both need to have zeroed hiter since the struct contains pointers. +// Gccgo-specific: *it need not be zeroed by the compiler, +// and it's cheaper to zero it here. func mapiterinit(t *maptype, h *hmap, it *hiter) { - // Clear pointer fields so garbage collector does not complain. it.key = nil it.value = nil it.t = nil it.h = nil it.buckets = nil it.bptr = nil - it.overflow[0] = nil - it.overflow[1] = nil + it.overflow = nil + it.oldoverflow = nil + it.wrapped = false + it.i = 0 + it.checkBucket = 0 if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiterinit)) } if h == nil || h.count == 0 { - it.key = nil - it.value = nil return } @@ -762,6 +748,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { // while we are iterating. 
h.createOverflow() it.overflow = h.extra.overflow + it.oldoverflow = h.extra.oldoverflow } // decide where to start @@ -769,16 +756,14 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { if h.B > 31-bucketCntBits { r += uintptr(fastrand()) << 31 } - it.startBucket = r & (uintptr(1)<<h.B - 1) + it.startBucket = r & bucketMask(h.B) it.offset = uint8(r >> h.B & (bucketCnt - 1)) // iterator state it.bucket = it.startBucket - it.wrapped = false - it.bptr = nil // Remember we have an iterator. - // Can run concurrently with another hash_iter_init(). + // Can run concurrently with another mapiterinit(). if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator { atomic.Or8(&h.flags, iterator|oldIterator) } @@ -789,7 +774,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) { func mapiternext(it *hiter) { h := it.h if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &it */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiternext)) } if h.flags&hashWriting != 0 { @@ -829,7 +814,7 @@ next: checkBucket = noCheck } bucket++ - if bucket == uintptr(1)<<it.B { + if bucket == bucketShift(it.B) { bucket = 0 it.wrapped = true } @@ -837,90 +822,75 @@ next: } for ; i < bucketCnt; i++ { offi := (i + it.offset) & (bucketCnt - 1) + if b.tophash[offi] == empty || b.tophash[offi] == evacuatedEmpty { + continue + } k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize)) + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.valuesize)) - if b.tophash[offi] != empty && b.tophash[offi] != evacuatedEmpty { - if checkBucket != noCheck && !h.sameSizeGrow() { - // Special case: iterator was started during a grow to a larger size - // and the grow is not done yet. We're working on a bucket whose - // oldbucket has not been evacuated yet. Or at least, it wasn't - // evacuated when we started the bucket. So we're iterating - // through the oldbucket, skipping any keys that will go - // to the other new bucket (each oldbucket expands to two - // buckets during a grow). - k2 := k - if t.indirectkey { - k2 = *((*unsafe.Pointer)(k2)) - } - if t.reflexivekey || equalfn(k2, k2) { - // If the item in the oldbucket is not destined for - // the current new bucket in the iteration, skip it. - hash := hashfn(k2, uintptr(h.hash0)) - if hash&(uintptr(1)<<it.B-1) != checkBucket { - continue - } - } else { - // Hash isn't repeatable if k != k (NaNs). We need a - // repeatable and randomish choice of which direction - // to send NaNs during evacuation. We'll use the low - // bit of tophash to decide which way NaNs go. - // NOTE: this case is why we need two evacuate tophash - // values, evacuatedX and evacuatedY, that differ in - // their low bit. - if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { - continue - } - } - } - if b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY { - // this is the golden data, we can return it. - if t.indirectkey { - k = *((*unsafe.Pointer)(k)) - } - it.key = k - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) + if checkBucket != noCheck && !h.sameSizeGrow() { + // Special case: iterator was started during a grow to a larger size + // and the grow is not done yet. We're working on a bucket whose + // oldbucket has not been evacuated yet. Or at least, it wasn't + // evacuated when we started the bucket. 
So we're iterating + // through the oldbucket, skipping any keys that will go + // to the other new bucket (each oldbucket expands to two + // buckets during a grow). + if t.reflexivekey || equalfn(k, k) { + // If the item in the oldbucket is not destined for + // the current new bucket in the iteration, skip it. + hash := hashfn(k, uintptr(h.hash0)) + if hash&bucketMask(it.B) != checkBucket { + continue } - it.value = v } else { - // The hash table has grown since the iterator was started. - // The golden data for this key is now somewhere else. - k2 := k - if t.indirectkey { - k2 = *((*unsafe.Pointer)(k2)) - } - if t.reflexivekey || equalfn(k2, k2) { - // Check the current hash table for the data. - // This code handles the case where the key - // has been deleted, updated, or deleted and reinserted. - // NOTE: we need to regrab the key as it has potentially been - // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). - rk, rv := mapaccessK(t, h, k2) - if rk == nil { - continue // key has been deleted - } - it.key = rk - it.value = rv - } else { - // if key!=key then the entry can't be deleted or - // updated, so we can just return it. That's lucky for - // us because when key!=key we can't look it up - // successfully in the current table. - it.key = k2 - if t.indirectvalue { - v = *((*unsafe.Pointer)(v)) - } - it.value = v + // Hash isn't repeatable if k != k (NaNs). We need a + // repeatable and randomish choice of which direction + // to send NaNs during evacuation. We'll use the low + // bit of tophash to decide which way NaNs go. + // NOTE: this case is why we need two evacuate tophash + // values, evacuatedX and evacuatedY, that differ in + // their low bit. + if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { + continue } } - it.bucket = bucket - if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 - it.bptr = b + } + if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) || + !(t.reflexivekey || equalfn(k, k)) { + // This is the golden data, we can return it. + // OR + // key!=key, so the entry can't be deleted or updated, so we can just return it. + // That's lucky for us because when key!=key we can't look it up successfully. + it.key = k + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) } - it.i = i + 1 - it.checkBucket = checkBucket - return + it.value = v + } else { + // The hash table has grown since the iterator was started. + // The golden data for this key is now somewhere else. + // Check the current hash table for the data. + // This code handles the case where the key + // has been deleted, updated, or deleted and reinserted. + // NOTE: we need to regrab the key as it has potentially been + // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). + rk, rv := mapaccessK(t, h, k) + if rk == nil { + continue // key has been deleted + } + it.key = rk + it.value = rv } + it.bucket = bucket + if it.bptr != b { // avoid unnecessary write barrier; see issue 14921 + it.bptr = b + } + it.i = i + 1 + it.checkBucket = checkBucket + return } b = b.overflow(t) i = 0 @@ -928,7 +898,7 @@ next: } func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow *bmap) { - base := uintptr(1 << b) + base := bucketShift(b) nbuckets := base // For small b, overflow buckets are unlikely. // Avoid the overhead of the calculation. 
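// A worked example of the sizing in the next hunk, assuming bucketCnt = 8 as
// elsewhere in this file: for b = 8 the base array is bucketShift(8) = 256
// buckets, and the estimate adds bucketShift(8-4) = 16 overflow buckets, so
// 272 buckets are carved out of one allocation before roundupsize rounds the
// request up to a size class.
//
//	base := bucketShift(8)     // 256 regular buckets
//	nbuckets := base + base>>4 // 256 + 16 = 272 buckets in a single block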
@@ -936,7 +906,7 @@ func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow // Add on the estimated number of overflow buckets // required to insert the median number of elements // used with this value of b. - nbuckets += 1 << (b - 4) + nbuckets += bucketShift(b - 4) sz := t.bucket.size * nbuckets up := roundupsize(sz) if up != sz { @@ -962,7 +932,7 @@ func hashGrow(t *maptype, h *hmap) { // Otherwise, there are too many overflow buckets, // so keep the same number of buckets and "grow" laterally. bigger := uint8(1) - if !overLoadFactor(int64(h.count), h.B) { + if !overLoadFactor(h.count+1, h.B) { bigger = 0 h.flags |= sameSizeGrow } @@ -981,13 +951,13 @@ func hashGrow(t *maptype, h *hmap) { h.nevacuate = 0 h.noverflow = 0 - if h.extra != nil && h.extra.overflow[0] != nil { + if h.extra != nil && h.extra.overflow != nil { // Promote current overflow buckets to the old generation. - if h.extra.overflow[1] != nil { - throw("overflow is not nil") + if h.extra.oldoverflow != nil { + throw("oldoverflow is not nil") } - h.extra.overflow[1] = h.extra.overflow[0] - h.extra.overflow[0] = nil + h.extra.oldoverflow = h.extra.overflow + h.extra.overflow = nil } if nextOverflow != nil { if h.extra == nil { @@ -1001,9 +971,8 @@ func hashGrow(t *maptype, h *hmap) { } // overLoadFactor reports whether count items placed in 1<<B buckets is over loadFactor. -func overLoadFactor(count int64, B uint8) bool { - // TODO: rewrite to use integer math and comparison? - return count >= bucketCnt && float32(count) >= loadFactor*float32((uint64(1)<<B)) +func overLoadFactor(count int, B uint8) bool { + return count > bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen) } // tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<<B buckets. @@ -1014,10 +983,11 @@ func tooManyOverflowBuckets(noverflow uint16, B uint8) bool { // If the threshold is too high, maps that grow and shrink can hold on to lots of unused memory. // "too many" means (approximately) as many overflow buckets as regular buckets. // See incrnoverflow for more details. - if B < 16 { - return noverflow >= uint16(1)<<B + if B > 15 { + B = 15 } - return noverflow >= 1<<15 + // The compiler doesn't see here that B < 16; mask B to generate shorter shift code. + return noverflow >= uint16(1)<<(B&15) } // growing reports whether h is growing. The growth may be to the same size or bigger. @@ -1036,7 +1006,7 @@ func (h *hmap) noldbuckets() uintptr { if !h.sameSizeGrow() { oldB-- } - return uintptr(1) << oldB + return bucketShift(oldB) } // oldbucketmask provides a mask that can be applied to calculate n % noldbuckets(). @@ -1060,33 +1030,38 @@ func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool { return evacuated(b) } +// evacDst is an evacuation destination. +type evacDst struct { + b *bmap // current destination bucket + i int // key/val index into b + k unsafe.Pointer // pointer to current key storage + v unsafe.Pointer // pointer to current value storage +} + func evacuate(t *maptype, h *hmap, oldbucket uintptr) { b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) newbit := h.noldbuckets() hashfn := t.key.hashfn - equalfn := t.key.equalfn if !evacuated(b) { // TODO: reuse overflow buckets instead of using new ones, if there // is no iterator using the old buckets. (If !oldIterator.) 
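// On the overLoadFactor rewrite a few hunks above: the old float32
// comparison becomes pure integer math. A minimal sketch, assuming the 6.5
// load factor is expressed as loadFactorNum/loadFactorDen = 13/2 (those
// constants are defined elsewhere in hashmap.go and do not appear in this
// diff):
func overLoadFactorSketch(count int, B uint8) bool {
	const loadFactorNum, loadFactorDen = 13, 2 // assumed values; 6.5 == 13/2
	// Dividing before multiplying keeps the product from overflowing for large B.
	return count > bucketCnt && uintptr(count) > loadFactorNum*(bucketShift(B)/loadFactorDen)
}
// For B = 5 (32 buckets) the map grows once an insert would push the count
// past 13*(32/2) = 208 entries, i.e. 6.5 entries per bucket on average.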
- var ( - x, y *bmap // current low/high buckets in new map - xi, yi int // key/val indices into x and y - xk, yk unsafe.Pointer // pointers to current x and y key storage - xv, yv unsafe.Pointer // pointers to current x and y value storage - ) - x = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*uintptr(t.keysize)) + if !h.sameSizeGrow() { // Only calculate y pointers if we're growing bigger. // Otherwise GC can see bad pointers. - y = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) + y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*uintptr(t.keysize)) } + for ; b != nil; b = b.overflow(t) { k := add(unsafe.Pointer(b), dataOffset) v := add(k, bucketCnt*uintptr(t.keysize)) @@ -1103,122 +1078,102 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) { if t.indirectkey { k2 = *((*unsafe.Pointer)(k2)) } - useX := true + var useY uint8 if !h.sameSizeGrow() { // Compute hash to make our evacuation decision (whether we need // to send this key/value to bucket x or bucket y). hash := hashfn(k2, uintptr(h.hash0)) - if h.flags&iterator != 0 { - if !t.reflexivekey && !equalfn(k2, k2) { - // If key != key (NaNs), then the hash could be (and probably - // will be) entirely different from the old hash. Moreover, - // it isn't reproducible. Reproducibility is required in the - // presence of iterators, as our evacuation decision must - // match whatever decision the iterator made. - // Fortunately, we have the freedom to send these keys either - // way. Also, tophash is meaningless for these kinds of keys. - // We let the low bit of tophash drive the evacuation decision. - // We recompute a new random tophash for the next level so - // these keys will get evenly distributed across all buckets - // after multiple grows. - if top&1 != 0 { - hash |= newbit - } else { - hash &^= newbit - } - top = uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } + if h.flags&iterator != 0 && !t.reflexivekey && !t.key.equalfn(k2, k2) { + // If key != key (NaNs), then the hash could be (and probably + // will be) entirely different from the old hash. Moreover, + // it isn't reproducible. Reproducibility is required in the + // presence of iterators, as our evacuation decision must + // match whatever decision the iterator made. + // Fortunately, we have the freedom to send these keys either + // way. Also, tophash is meaningless for these kinds of keys. + // We let the low bit of tophash drive the evacuation decision. + // We recompute a new random tophash for the next level so + // these keys will get evenly distributed across all buckets + // after multiple grows. 
+ useY = top & 1 + top = tophash(hash) + } else { + if hash&newbit != 0 { + useY = 1 } } - useX = hash&newbit == 0 } - if useX { - b.tophash[i] = evacuatedX - if xi == bucketCnt { - newx := h.newoverflow(t, x) - x = newx - xi = 0 - xk = add(unsafe.Pointer(x), dataOffset) - xv = add(xk, bucketCnt*uintptr(t.keysize)) - } - x.tophash[xi] = top - if t.indirectkey { - *(*unsafe.Pointer)(xk) = k2 // copy pointer - } else { - typedmemmove(t.key, xk, k) // copy value - } - if t.indirectvalue { - *(*unsafe.Pointer)(xv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, xv, v) - } - xi++ - xk = add(xk, uintptr(t.keysize)) - xv = add(xv, uintptr(t.valuesize)) + + if evacuatedX+1 != evacuatedY { + throw("bad evacuatedN") + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*uintptr(t.keysize)) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + if t.indirectkey { + *(*unsafe.Pointer)(dst.k) = k2 // copy pointer } else { - b.tophash[i] = evacuatedY - if yi == bucketCnt { - newy := h.newoverflow(t, y) - y = newy - yi = 0 - yk = add(unsafe.Pointer(y), dataOffset) - yv = add(yk, bucketCnt*uintptr(t.keysize)) - } - y.tophash[yi] = top - if t.indirectkey { - *(*unsafe.Pointer)(yk) = k2 - } else { - typedmemmove(t.key, yk, k) - } - if t.indirectvalue { - *(*unsafe.Pointer)(yv) = *(*unsafe.Pointer)(v) - } else { - typedmemmove(t.elem, yv, v) - } - yi++ - yk = add(yk, uintptr(t.keysize)) - yv = add(yv, uintptr(t.valuesize)) + typedmemmove(t.key, dst.k, k) // copy value } + if t.indirectvalue { + *(*unsafe.Pointer)(dst.v) = *(*unsafe.Pointer)(v) + } else { + typedmemmove(t.elem, dst.v, v) + } + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, uintptr(t.keysize)) + dst.v = add(dst.v, uintptr(t.valuesize)) } } // Unlink the overflow buckets & clear key/value to help GC. - if h.flags&oldIterator == 0 { - b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) // Preserve b.tophash because the evacuation // state is maintained there. - if t.bucket.kind&kindNoPointers == 0 { - memclrHasPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } else { - memclrNoHeapPointers(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) - } + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) } } - // Advance evacuation mark if oldbucket == h.nevacuate { - h.nevacuate = oldbucket + 1 - // Experiments suggest that 1024 is overkill by at least an order of magnitude. - // Put it in there as a safeguard anyway, to ensure O(1) behavior. - stop := h.nevacuate + 1024 - if stop > newbit { - stop = newbit - } - for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { - h.nevacuate++ - } - if h.nevacuate == newbit { // newbit == # of oldbuckets - // Growing is all done. Free old main bucket array. - h.oldbuckets = nil - // Can discard old overflow buckets as well. 
- // If they are still referenced by an iterator, - // then the iterator holds a pointer to the slice. - if h.extra != nil { - h.extra.overflow[1] = nil - } - h.flags &^= sameSizeGrow + advanceEvacuationMark(h, t, newbit) + } +} + +func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) { + h.nevacuate++ + // Experiments suggest that 1024 is overkill by at least an order of magnitude. + // Put it in there as a safeguard anyway, to ensure O(1) behavior. + stop := h.nevacuate + 1024 + if stop > newbit { + stop = newbit + } + for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) { + h.nevacuate++ + } + if h.nevacuate == newbit { // newbit == # of oldbuckets + // Growing is all done. Free old main bucket array. + h.oldbuckets = nil + // Can discard old overflow buckets as well. + // If they are still referenced by an iterator, + // then the iterator holds a pointer to the slice. + if h.extra != nil { + h.extra.oldoverflow = nil } + h.flags &^= sameSizeGrow } } @@ -1230,7 +1185,45 @@ func ismapkey(t *_type) bool { //go:linkname reflect_makemap reflect.makemap func reflect_makemap(t *maptype, cap int) *hmap { - return makemap(t, int64(cap), nil, nil) + // Check invariants and reflect's math. + if sz := unsafe.Sizeof(hmap{}); sz != t.hmap.size { + println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size) + throw("bad hmap size") + } + if !ismapkey(t.key) { + throw("runtime.reflect_makemap: unsupported map key type") + } + if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) || + t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { + throw("key size wrong") + } + if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) || + t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { + throw("value size wrong") + } + if t.key.align > bucketCnt { + throw("key align too big") + } + if t.elem.align > bucketCnt { + throw("value align too big") + } + if t.key.size%uintptr(t.key.align) != 0 { + throw("key size not a multiple of key align") + } + if t.elem.size%uintptr(t.elem.align) != 0 { + throw("value size not a multiple of value align") + } + if bucketCnt < 8 { + throw("bucketsize too small for proper alignment") + } + if dataOffset%uintptr(t.key.align) != 0 { + throw("need padding in bucket (key)") + } + if dataOffset%uintptr(t.elem.align) != 0 { + throw("need padding in bucket (value)") + } + + return makemap(t, cap, nil) } //go:linkname reflect_mapaccess reflect.mapaccess @@ -1277,7 +1270,7 @@ func reflect_maplen(h *hmap) int { return 0 } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer( /* &h */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(reflect_maplen)) } return h.count diff --git a/libgo/go/runtime/hashmap_fast.go b/libgo/go/runtime/hashmap_fast.go index bec8fdac14e..e0fc9815131 100644 --- a/libgo/go/runtime/hashmap_fast.go +++ b/libgo/go/runtime/hashmap_fast.go @@ -11,7 +11,7 @@ import ( func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32)) } if h == nil || h.count == 0 { @@ -26,7 +26,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { b = (*bmap)(h.buckets) } else { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := 
bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -39,28 +39,19 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) - if k != key { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32)) } if h == nil || h.count == 0 { @@ -75,7 +66,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { b = (*bmap)(h.buckets) } else { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -88,28 +79,19 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) - if k != key { - continue - } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if *(*uint32)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false } } + return unsafe.Pointer(&zeroVal[0]), false } func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64)) } if h == nil || h.count == 0 { @@ -124,7 +106,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { b = (*bmap)(h.buckets) } else { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -137,28 +119,19 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) - if k != key { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), 
dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64)) } if h == nil || h.count == 0 { @@ -173,7 +146,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { b = (*bmap)(h.buckets) } else { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -186,28 +159,19 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { } } } - for { - for i := uintptr(0); i < bucketCnt; i++ { - k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) - if k != key { - continue - } - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if *(*uint64)(k) == key && b.tophash[i] != empty { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true } - return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true - } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false } } + return unsafe.Pointer(&zeroVal[0]), false } func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr)) } if h == nil || h.count == 0 { @@ -222,13 +186,9 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { b := (*bmap)(h.buckets) if key.len < 32 { // short key, doing lots of comparisons is ok - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -239,13 +199,9 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { } // long key, try not to do more comparisons than necessary keymaybe := uintptr(bucketCnt) - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str { @@ -275,7 +231,7 @@ func 
mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { } dohash: hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -287,34 +243,24 @@ dohash: b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) } } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]) - } } + return unsafe.Pointer(&zeroVal[0]) } func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer( /* &t */ nil)) + callerpc := getcallerpc() racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr)) } if h == nil || h.count == 0 { @@ -329,13 +275,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { b := (*bmap)(h.buckets) if key.len < 32 { // short key, doing lots of comparisons is ok - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -346,13 +288,9 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } // long key, try not to do more comparisons than necessary keymaybe := uintptr(bucketCnt) - for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x == empty { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] == empty { continue } if k.str == key.str { @@ -382,7 +320,7 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { } dohash: hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) - m := uintptr(1)<<h.B - 1 + m := bucketMask(h.B) b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) if c := h.oldbuckets; c != nil { if !h.sameSizeGrow() { @@ -394,37 +332,113 @@ dohash: b = oldb } } - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + top := tophash(hash) + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := 
(*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { + continue + } + if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true + } + } + } + return unsafe.Pointer(&zeroVal[0]), false +} + +func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_fast32(t, h, bucket) } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check - if x != top { + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b + } continue } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { continue } - if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { - return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true - } + inserti = i + insertb = b + goto done } - b = b.overflow(t) - if b == nil { - return unsafe.Pointer(&zeroVal[0]), false + ovf := b.overflow(t) + if ovf == nil { + break } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if insertb == nil { + // all current buckets are full, allocate a new one. 
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*uint32)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val } -func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { +func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32)) } if h.flags&hashWriting != 0 { @@ -436,38 +450,35 @@ func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast32(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*4) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + inserti = i + insertb = b } continue } - k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*4))) if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -481,25 +492,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*4) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*4) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*4+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -512,7 +524,7 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { @@ -524,30 +536,26 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - var inserti *uint8 + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*8) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i } continue } @@ -555,7 +563,8 @@ again: if k != key { continue } - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + insertb = b + inserti = i goto done } ovf := b.overflow(t) @@ -569,25 +578,26 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
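// The inserti&(bucketCnt-1) masking just below (repeated in every
// mapassign_fast* variant) is flagged in its own comment as a bounds-check
// eliminator, not a semantic change: inserti is always below bucketCnt here,
// but the compiler cannot prove it. Masking an index with a
// power-of-two-minus-one makes the range obvious, e.g.:
//
//	var tophashes [8]uint8 // bucketCnt == 8
//	tophashes[i&7] = top   // i&7 < 8 by construction, so no bounds check is emitted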
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*8) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*uint64)(insertk) = key - // store new key/value at insert position - typedmemmove(t.key, insertk, unsafe.Pointer(&key)) - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -595,48 +605,131 @@ done: return val } -func mapassign_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { +func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { if h == nil { panic(plainError("assignment to entry in nil map")) } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&t)) - racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64)) } if h.flags&hashWriting != 0 { throw("concurrent map writes") } - key := stringStructOf(&ky) - hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) + hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. h.flags |= hashWriting if h.buckets == nil { - h.buckets = newarray(t.bucket, 1) + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) } again: - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + + var insertb *bmap + var inserti uintptr + var insertk unsafe.Pointer + + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] == empty { + if insertb == nil { + insertb = b + inserti = i + } + continue + } + k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + insertb = b + inserti = i + goto done + } + ovf := b.overflow(t) + if ovf == nil { + break + } + b = ovf + } + + // Did not find mapping for key. Allocate new cell & add entry. + + // If we hit the max load factor or we have too many overflow buckets, + // and we're not already in the middle of growing, start growing. + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again } - var inserti *uint8 + if insertb == nil { + // all current buckets are full, allocate a new one. 
+ insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti + } + insertb.tophash[inserti&(bucketCnt-1)] = tophash(hash) // mask inserti to avoid bounds checks + + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*8) + // store new key at insert position + *(*unsafe.Pointer)(insertk) = key + + h.count++ + +done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*8+inserti*uintptr(t.valuesize)) + if h.flags&hashWriting == 0 { + throw("concurrent map writes") + } + h.flags &^= hashWriting + return val +} + +func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { + if h == nil { + panic(plainError("assignment to entry in nil map")) + } + if raceenabled { + callerpc := getcallerpc() + racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr)) + } + if h.flags&hashWriting != 0 { + throw("concurrent map writes") + } + key := stringStructOf(&s) + hash := t.key.hashfn(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) + + // Set hashWriting after calling alg.hash for consistency with mapassign. + h.flags |= hashWriting + + if h.buckets == nil { + h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) + } + +again: + bucket := hash & bucketMask(h.B) + if h.growing() { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + top := tophash(hash) + + var insertb *bmap + var inserti uintptr var insertk unsafe.Pointer - var val unsafe.Pointer + for { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { - inserti = &b.tophash[i] - insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if b.tophash[i] == empty && insertb == nil { + insertb = b + inserti = i } continue } @@ -648,7 +741,8 @@ again: continue } // already have a mapping for key. Update it. - val = add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + inserti = i + insertb = b goto done } ovf := b.overflow(t) @@ -662,25 +756,25 @@ again: // If we hit the max load factor or we have too many overflow buckets, // and we're not already in the middle of growing, start growing. - if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { + if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) { hashGrow(t, h) goto again // Growing the table invalidates everything, so try again } - if inserti == nil { + if insertb == nil { // all current buckets are full, allocate a new one. 
- newb := h.newoverflow(t, b) - inserti = &newb.tophash[0] - insertk = add(unsafe.Pointer(newb), dataOffset) - val = add(insertk, bucketCnt*2*sys.PtrSize) + insertb = h.newoverflow(t, b) + inserti = 0 // not necessary, but avoids needlessly spilling inserti } + insertb.tophash[inserti&(bucketCnt-1)] = top // mask inserti to avoid bounds checks - // store new key/value at insert position + insertk = add(unsafe.Pointer(insertb), dataOffset+inserti*2*sys.PtrSize) + // store new key at insert position *((*stringStruct)(insertk)) = *key - *inserti = top h.count++ done: + val := add(unsafe.Pointer(insertb), dataOffset+bucketCnt*2*sys.PtrSize+inserti*uintptr(t.valuesize)) if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -690,7 +784,7 @@ done: func mapdelete_fast32(t *maptype, h *hmap, key uint32) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast32)) } if h == nil || h.count == 0 { @@ -705,38 +799,32 @@ func mapdelete_fast32(t *maptype, h *hmap, key uint32) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) - } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash + growWork_fast32(t, h, bucket) } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { + if key != *(*uint32)(k) || b.tophash[i] == empty { continue } - k := (*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)) - if key != *k { - continue + // Only clear key if there are pointers in it. + if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. 
+ if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*4 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -745,7 +833,7 @@ done: func mapdelete_fast64(t *maptype, h *hmap, key uint64) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast64)) } if h == nil || h.count == 0 { @@ -760,38 +848,32 @@ func mapdelete_fast64(t *maptype, h *hmap, key uint64) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) + growWork_fast64(t, h, bucket) } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) +search: + for ; b != nil; b = b.overflow(t) { + for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { + if key != *(*uint64)(k) || b.tophash[i] == empty { continue } - k := (*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)) - if key != *k { - continue + // Only clear key if there are pointers in it. + if t.key.kind&kindNoPointers == 0 { + memclrHasPointers(k, t.key.size) + } + // Only clear value if there are pointers in it. 
+ if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*8 + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } @@ -800,7 +882,7 @@ done: func mapdelete_faststr(t *maptype, h *hmap, ky string) { if raceenabled && h != nil { - callerpc := getcallerpc(unsafe.Pointer(&t)) + callerpc := getcallerpc() racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_faststr)) } if h == nil || h.count == 0 { @@ -816,43 +898,340 @@ func mapdelete_faststr(t *maptype, h *hmap, ky string) { // Set hashWriting after calling alg.hash for consistency with mapdelete h.flags |= hashWriting - bucket := hash & (uintptr(1)<<h.B - 1) + bucket := hash & bucketMask(h.B) if h.growing() { - growWork(t, h, bucket) - } - b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) - top := uint8(hash >> (sys.PtrSize*8 - 8)) - if top < minTopHash { - top += minTopHash - } - for { - for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] != top { - continue - } - k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) - if k.len != key.len { + growWork_faststr(t, h, bucket) + } + b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + top := tophash(hash) +search: + for ; b != nil; b = b.overflow(t) { + for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { + k := (*stringStruct)(kptr) + if k.len != key.len || b.tophash[i] != top { continue } if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) { continue } - typedmemclr(t.key, unsafe.Pointer(k)) - v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*2*sys.PtrSize + i*uintptr(t.valuesize)) - typedmemclr(t.elem, v) + // Clear key's pointer. + k.str = nil + // Only clear value if there are pointers in it. + if t.elem.kind&kindNoPointers == 0 { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)) + memclrHasPointers(v, t.elem.size) + } b.tophash[i] = empty h.count-- - goto done - } - b = b.overflow(t) - if b == nil { - goto done + break search } } -done: if h.flags&hashWriting == 0 { throw("concurrent map writes") } h.flags &^= hashWriting } + +func growWork_fast32(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast32(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast32(t, h, h.nevacuate) + } +} + +func evacuate_fast32(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*4) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*4) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*4) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 4), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*4) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if sys.PtrSize == 4 && t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + *(*uint32)(dst.k) = *(*uint32)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 4) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_fast64(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_fast64(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_fast64(t, h, h.nevacuate) + } +} + +func evacuate_fast64(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*8) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*8) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*8) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 8), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*8) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + if t.key.kind&kindNoPointers == 0 && writeBarrier.enabled { + if sys.PtrSize == 8 { + writebarrierptr((*uintptr)(dst.k), *(*uintptr)(k)) + } else { + // There are three ways to squeeze at least one 32 bit pointer into 64 bits. + // Give up and call typedmemmove. + typedmemmove(t.key, dst.k, k) + } + } else { + *(*uint64)(dst.k) = *(*uint64)(k) + } + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 8) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} + +func growWork_faststr(t *maptype, h *hmap, bucket uintptr) { + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate_faststr(t, h, bucket&h.oldbucketmask()) + + // evacuate one more oldbucket to make progress on growing + if h.growing() { + evacuate_faststr(t, h, h.nevacuate) + } +} + +func evacuate_faststr(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := h.noldbuckets() + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) + + // xy contains the x and y (low and high) evacuation destinations. + var xy [2]evacDst + x := &xy[0] + x.b = (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + x.k = add(unsafe.Pointer(x.b), dataOffset) + x.v = add(x.k, bucketCnt*2*sys.PtrSize) + + if !h.sameSizeGrow() { + // Only calculate y pointers if we're growing bigger. + // Otherwise GC can see bad pointers. 
+ y := &xy[1] + y.b = (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + y.k = add(unsafe.Pointer(y.b), dataOffset) + y.v = add(y.k, bucketCnt*2*sys.PtrSize) + } + + for ; b != nil; b = b.overflow(t) { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*2*sys.PtrSize) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 2*sys.PtrSize), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + throw("bad map state") + } + var useY uint8 + if !h.sameSizeGrow() { + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := t.key.hashfn(k, uintptr(h.hash0)) + if hash&newbit != 0 { + useY = 1 + } + } + + b.tophash[i] = evacuatedX + useY // evacuatedX + 1 == evacuatedY, enforced in makemap + dst := &xy[useY] // evacuation destination + + if dst.i == bucketCnt { + dst.b = h.newoverflow(t, dst.b) + dst.i = 0 + dst.k = add(unsafe.Pointer(dst.b), dataOffset) + dst.v = add(dst.k, bucketCnt*2*sys.PtrSize) + } + dst.b.tophash[dst.i&(bucketCnt-1)] = top // mask dst.i as an optimization, to avoid a bounds check + + // Copy key. + *(*string)(dst.k) = *(*string)(k) + + typedmemmove(t.elem, dst.v, v) + dst.i++ + // These updates might push these pointers past the end of the + // key or value arrays. That's ok, as we have the overflow pointer + // at the end of the bucket to protect against pointing past the + // end of the bucket. + dst.k = add(dst.k, 2*sys.PtrSize) + dst.v = add(dst.v, uintptr(t.valuesize)) + } + } + // Unlink the overflow buckets & clear key/value to help GC. + if h.flags&oldIterator == 0 && t.bucket.kind&kindNoPointers == 0 { + b := add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)) + // Preserve b.tophash because the evacuation + // state is maintained there. + ptr := add(b, dataOffset) + n := uintptr(t.bucketsize) - dataOffset + memclrHasPointers(ptr, n) + } + } + + if oldbucket == h.nevacuate { + advanceEvacuationMark(h, t, newbit) + } +} diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go index 166199b5ca3..a4b168d7313 100644 --- a/libgo/go/runtime/heapdump.go +++ b/libgo/go/runtime/heapdump.go @@ -200,7 +200,6 @@ func dumptype(t *_type) { // dump an object func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) { - dumpbvtypes(&bv, obj) dumpint(tagObject) dumpint(uint64(uintptr(obj))) dumpmemrange(obj, size) @@ -539,16 +538,6 @@ func dumpfields(bv bitvector) { dumpint(fieldKindEol) } -// The heap dump reader needs to be able to disambiguate -// Eface entries. So it needs to know every type that might -// appear in such an entry. The following routine accomplishes that. -// TODO(rsc, khr): Delete - no longer possible. - -// Dump all the types that appear in the type field of -// any Eface described by this bit vector. -func dumpbvtypes(bv *bitvector, base unsafe.Pointer) { -} - func makeheapobjbv(p uintptr, size uintptr) bitvector { // Extend the temp buffer if necessary. nptr := size / sys.PtrSize diff --git a/libgo/go/runtime/internal/atomic/atomic_test.go b/libgo/go/runtime/internal/atomic/atomic_test.go index 879a82f9c82..b697aa8bd39 100644 --- a/libgo/go/runtime/internal/atomic/atomic_test.go +++ b/libgo/go/runtime/internal/atomic/atomic_test.go @@ -52,7 +52,7 @@ func TestXadduintptr(t *testing.T) { // Tests that xadduintptr correctly updates 64-bit values. 
The place where // we actually do so is mstats.go, functions mSysStat{Inc,Dec}. func TestXadduintptrOnUint64(t *testing.T) { - if sys.BigEndian != 0 { + if sys.BigEndian { // On big endian architectures, we never use xadduintptr to update // 64-bit values and hence we skip the test. (Note that functions // mSysStat{Inc,Dec} in mstats.go have explicit checks for diff --git a/libgo/go/runtime/internal/sys/sys.go b/libgo/go/runtime/internal/sys/sys.go index 586a763717d..9d9ac4507f6 100644 --- a/libgo/go/runtime/internal/sys/sys.go +++ b/libgo/go/runtime/internal/sys/sys.go @@ -6,9 +6,9 @@ // constants used by the runtime. package sys -// The next line makes 'go generate' write the zgen_*.go files with +// The next line makes 'go generate' write the zgo*.go files with // per-OS and per-arch information, including constants -// named goos_$GOOS and goarch_$GOARCH for every +// named Goos$GOOS and Goarch$GOARCH for every // known GOOS and GOARCH. The constant is 1 on the // current system, 0 otherwise; multiplying by them is // useful for defining GOOS- or GOARCH-specific constants. diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go index 52a2376dc5e..d000b112f44 100644 --- a/libgo/go/runtime/lock_sema.go +++ b/libgo/go/runtime/lock_sema.go @@ -83,7 +83,7 @@ Loop: // for this lock, chained through m->nextwaitm. // Queue this M. for { - gp.m.nextwaitm = v &^ mutex_locked + gp.m.nextwaitm = muintptr(v &^ mutex_locked) if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|mutex_locked) { break } @@ -115,8 +115,8 @@ func unlock(l *mutex) { } else { // Other M's are waiting for the lock. // Dequeue an M. - mp = (*m)(unsafe.Pointer(v &^ mutex_locked)) - if atomic.Casuintptr(&l.key, v, mp.nextwaitm) { + mp = muintptr(v &^ mutex_locked).ptr() + if atomic.Casuintptr(&l.key, v, uintptr(mp.nextwaitm)) { // Dequeued an M. Wake it. semawakeup(mp) break @@ -152,7 +152,7 @@ func notewakeup(n *note) { case v == 0: // Nothing was waiting. Done. case v == mutex_locked: - // Two notewakeups! Not allowed. + // Two notewakeups! Not allowed. throw("notewakeup - double wakeup") default: // Must be the waiting m. Wake it up. diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go index 796cd8a7c64..88e4ba3657b 100644 --- a/libgo/go/runtime/malloc.go +++ b/libgo/go/runtime/malloc.go @@ -546,9 +546,8 @@ func nextFreeFast(s *mspan) gclinkptr { } s.allocCache >>= uint(theBit + 1) s.freeindex = freeidx - v := gclinkptr(result*s.elemsize + s.base()) s.allocCount++ - return v + return gclinkptr(result*s.elemsize + s.base()) } } return 0 @@ -877,6 +876,9 @@ func reflect_unsafe_New(typ *_type) unsafe.Pointer { // newarray allocates an array of n elements of type typ. func newarray(typ *_type, n int) unsafe.Pointer { + if n == 1 { + return mallocgc(typ.size, typ, true) + } if n < 0 || uintptr(n) > maxSliceCap(typ.size) { panic(plainError("runtime: allocation size out of range")) } @@ -893,11 +895,13 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { mProf_Malloc(x, size) } -// nextSample returns the next sampling point for heap profiling. -// It produces a random variable with a geometric distribution and -// mean MemProfileRate. This is done by generating a uniformly -// distributed random number and applying the cumulative distribution -// function for an exponential. +// nextSample returns the next sampling point for heap profiling. 
The goal is +// to sample allocations on average every MemProfileRate bytes, but with a +// completely random distribution over the allocation timeline; this +// corresponds to a Poisson process with parameter MemProfileRate. In Poisson +// processes, the distance between two samples follows the exponential +// distribution (exp(MemProfileRate)), so the best return value is a random +// number taken from an exponential distribution whose mean is MemProfileRate. func nextSample() int32 { if GOOS == "plan9" { // Plan 9 doesn't support floating point in note handler. @@ -906,25 +910,29 @@ func nextSample() int32 { } } - period := MemProfileRate + return fastexprand(MemProfileRate) +} - // make nextSample not overflow. Maximum possible step is - // -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period. +// fastexprand returns a random number from an exponential distribution with +// the specified mean. +func fastexprand(mean int) int32 { + // Avoid overflow. Maximum possible step is + // -ln(1/(1<<randomBitCount)) * mean, approximately 20 * mean. switch { - case period > 0x7000000: - period = 0x7000000 - case period == 0: + case mean > 0x7000000: + mean = 0x7000000 + case mean == 0: return 0 } - // Let m be the sample rate, - // the probability distribution function is m*exp(-mx), so the CDF is - // p = 1 - exp(-mx), so - // q = 1 - p == exp(-mx) - // log_e(q) = -mx - // -log_e(q)/m = x - // x = -log_e(q) * period - // x = log_2(q) * (-log_e(2)) * period ; Using log_2 for efficiency + // Take a random sample of the exponential distribution exp(-mean*x). + // The probability distribution function is mean*exp(-mean*x), so the CDF is + // p = 1 - exp(-mean*x), so + // q = 1 - p == exp(-mean*x) + // log_e(q) = -mean*x + // -log_e(q)/mean = x + // x = -log_e(q) * mean + // x = log_2(q) * (-log_e(2)) * mean ; Using log_2 for efficiency const randomBitCount = 26 q := fastrand()%(1<<randomBitCount) + 1 qlog := fastlog2(float64(q)) - randomBitCount @@ -932,7 +940,7 @@ func nextSample() int32 { qlog = 0 } const minusLog2 = -0.6931471805599453 // -ln(2) - return int32(qlog*(minusLog2*float64(period))) + 1 + return int32(qlog*(minusLog2*float64(mean))) + 1 } // nextSampleNoFP is similar to nextSample, but uses older, @@ -950,7 +958,7 @@ func nextSampleNoFP() int32 { } type persistentAlloc struct { - base unsafe.Pointer + base *notInHeap off uintptr } @@ -967,17 +975,17 @@ var globalAlloc struct { // // Consider marking persistentalloc'd types go:notinheap. func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { - var p unsafe.Pointer + var p *notInHeap systemstack(func() { p = persistentalloc1(size, align, sysStat) }) - return p + return unsafe.Pointer(p) } // Must run on system stack because stack growth can (re)invoke it. // See issue 9174. 
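The rewritten sampler above is easier to see outside the runtime. One draw works like this: take q uniform in (0, 2^26], then -ln(q/2^26)*mean is exponentially distributed with the requested mean, which is exactly the CDF inversion spelled out in the comment. A sketch that substitutes math.Log2 for the runtime's table-driven fastlog2; the function name and the use of math/rand are illustrative:

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// One draw of the sampling distance: q is uniform in (0, 2^26], so
// q/2^26 is uniform in (0, 1], and -ln(q/2^26)*mean is exponential
// with the given mean. math.Log2 stands in for the runtime's
// table-driven fastlog2 approximation.
func nextSampleGap(mean float64, rng *rand.Rand) int {
	const randomBitCount = 26
	q := float64(rng.Intn(1<<randomBitCount) + 1)
	qlog := math.Log2(q) - randomBitCount
	if qlog > 0 {
		qlog = 0
	}
	const minusLog2 = -0.6931471805599453 // -ln(2), converts log2 back to -ln
	return int(qlog*minusLog2*mean) + 1
}

func main() {
	rng := rand.New(rand.NewSource(1))
	const mean, n = 4096, 200000
	sum := 0
	for i := 0; i < n; i++ {
		sum += nextSampleGap(mean, rng)
	}
	fmt.Printf("average gap over %d draws: %d bytes (target %d)\n", n, sum/n, int(mean))
}

Averaging many draws should land near the requested mean, which is the property the Poisson-process comment relies on.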
//go:systemstack -func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { +func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { const ( chunk = 256 << 10 maxBlock = 64 << 10 // VM reservation granularity is 64K on windows @@ -998,7 +1006,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } if size >= maxBlock { - return sysAlloc(size, sysStat) + return (*notInHeap)(sysAlloc(size, sysStat)) } mp := acquirem() @@ -1011,7 +1019,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = round(persistent.off, align) if persistent.off+size > chunk || persistent.base == nil { - persistent.base = sysAlloc(chunk, &memstats.other_sys) + persistent.base = (*notInHeap)(sysAlloc(chunk, &memstats.other_sys)) if persistent.base == nil { if persistent == &globalAlloc.persistentAlloc { unlock(&globalAlloc.mutex) @@ -1020,7 +1028,7 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } persistent.off = 0 } - p := add(persistent.base, persistent.off) + p := persistent.base.add(persistent.off) persistent.off += size releasem(mp) if persistent == &globalAlloc.persistentAlloc { @@ -1033,3 +1041,19 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer { } return p } + +// notInHeap is off-heap memory allocated by a lower-level allocator +// like sysAlloc or persistentAlloc. +// +// In general, it's better to use real types marked as go:notinheap, +// but this serves as a generic type for situations where that isn't +// possible (like in the allocators). +// +// TODO: Use this as the return type of sysAlloc, persistentAlloc, etc? +// +//go:notinheap +type notInHeap struct{} + +func (p *notInHeap) add(bytes uintptr) *notInHeap { + return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes)) +} diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go index 0d43cf65976..ab580f81800 100644 --- a/libgo/go/runtime/malloc_test.go +++ b/libgo/go/runtime/malloc_test.go @@ -48,9 +48,6 @@ func TestMemStats(t *testing.T) { } // Of the uint fields, HeapReleased, HeapIdle can be 0. // PauseTotalNs can be 0 if timer resolution is poor. - // - // TODO: Test that GCCPUFraction is <= 0.99. This currently - // fails on windows/386. 
(Issue #19319) fields := map[string][]func(interface{}) error{ "Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)}, "Lookups": {nz, le(1e10)}, "Mallocs": {nz, le(1e10)}, "Frees": {nz, le(1e10)}, @@ -63,7 +60,7 @@ func TestMemStats(t *testing.T) { "NextGC": {nz, le(1e10)}, "LastGC": {nz}, "PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil, "NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)}, - "GCCPUFraction": nil, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, + "GCCPUFraction": {le(0.99)}, "EnableGC": {eq(true)}, "DebugGC": {eq(false)}, "BySize": nil, } diff --git a/libgo/go/runtime/map_test.go b/libgo/go/runtime/map_test.go index 37c959f8327..6d7097e07ef 100644 --- a/libgo/go/runtime/map_test.go +++ b/libgo/go/runtime/map_test.go @@ -249,7 +249,7 @@ func testConcurrentReadsAfterGrowth(t *testing.T, useReflect bool) { numGrowStep := 250 numReader := 16 if testing.Short() { - numLoop, numGrowStep = 2, 500 + numLoop, numGrowStep = 2, 100 } for i := 0; i < numLoop; i++ { m := make(map[int]int, 0) @@ -603,6 +603,142 @@ func TestIgnoreBogusMapHint(t *testing.T) { } } +var mapSink map[int]int + +var mapBucketTests = [...]struct { + n int // n is the number of map elements + noescape int // number of expected buckets for non-escaping map + escape int // number of expected buckets for escaping map +}{ + {-(1 << 30), 1, 1}, + {-1, 1, 1}, + {0, 1, 1}, + {1, 1, 1}, + {8, 1, 1}, + {9, 2, 2}, + {13, 2, 2}, + {14, 4, 4}, + {26, 4, 4}, +} + +func TestMapBuckets(t *testing.T) { + // Test that maps of different sizes have the right number of buckets. + // Non-escaping maps with small buckets (like map[int]int) never + // have a nil bucket pointer due to starting with preallocated buckets + // on the stack. Escaping maps start with a non-nil bucket pointer if + // hint size is above bucketCnt and thereby have more than one bucket. + // These tests depend on bucketCnt and loadFactor* in hashmap.go. + t.Run("mapliteral", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := map[int]int{} + // Skip test on gccgo until escape analysis is + // turned on. + if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := map[int]int{} + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("nohint", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int) + // Skip test on gccgo until escape analysis is + // turned on. 
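The expected counts in mapBucketTests follow from the load-factor rule in hashmap.go: pick the smallest power-of-two bucket count that keeps the hint at or under 6.5 entries per 8-slot bucket. A standalone sketch of that rule, with the constants restated here as assumptions about this Go version:

package main

import "fmt"

// The sizing rule behind the expected bucket counts in mapBucketTests:
// choose the smallest B such that hint elements do not overload
// 2^B buckets at 6.5 entries per 8-slot bucket.
const (
	bucketCnt     = 8
	loadFactorNum = 13 // load factor is 13/2 = 6.5
	loadFactorDen = 2
)

func overLoadFactor(count int, B uint8) bool {
	return count > bucketCnt && uint64(count) > loadFactorNum*((uint64(1)<<B)/loadFactorDen)
}

func bucketsForHint(hint int) int {
	B := uint8(0)
	for overLoadFactor(hint, B) {
		B++
	}
	return 1 << B
}

func main() {
	for _, hint := range []int{0, 1, 8, 9, 13, 14, 26} {
		fmt.Printf("hint=%2d -> %d bucket(s)\n", hint, bucketsForHint(hint))
	}
}

It reproduces the table: hints up to 8 fit in one bucket, 9 through 13 need two, and 14 through 26 need four.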
+ if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, tt.n) + // Skip test on gccgo until escape analysis is + // turned on. + if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + t.Run("makemap64", func(t *testing.T) { + for _, tt := range mapBucketTests { + localMap := make(map[int]int, int64(tt.n)) + // Skip test on gccgo until escape analysis is + // turned on. 
+ if runtime.MapBucketsPointerIsNil(localMap) && runtime.Compiler != "gccgo" { + t.Errorf("no escape: buckets pointer is nil for non-escaping map") + } + for i := 0; i < tt.n; i++ { + localMap[i] = i + } + if got := runtime.MapBucketsCount(localMap); got != tt.noescape { + t.Errorf("no escape: n=%d want %d buckets, got %d", tt.n, tt.noescape, got) + } + escapingMap := make(map[int]int, tt.n) + if count := runtime.MapBucketsCount(escapingMap); count > 1 && runtime.MapBucketsPointerIsNil(escapingMap) { + t.Errorf("escape: buckets pointer is nil for n=%d buckets", count) + } + for i := 0; i < tt.n; i++ { + escapingMap[i] = i + } + if got := runtime.MapBucketsCount(escapingMap); got != tt.escape { + t.Errorf("escape: n=%d want %d buckets, got %d", tt.n, tt.escape, got) + } + mapSink = escapingMap + } + }) + +} + func benchmarkMapPop(b *testing.B, n int) { m := map[int]int{} for i := 0; i < b.N; i++ { @@ -624,15 +760,39 @@ func BenchmarkMapPop100(b *testing.B) { benchmarkMapPop(b, 100) } func BenchmarkMapPop1000(b *testing.B) { benchmarkMapPop(b, 1000) } func BenchmarkMapPop10000(b *testing.B) { benchmarkMapPop(b, 10000) } +var testNonEscapingMapVariable int = 8 + func TestNonEscapingMap(t *testing.T) { t.Skip("does not work on gccgo without better escape analysis") n := testing.AllocsPerRun(1000, func() { + m := map[int]int{} + m[0] = 0 + }) + if n != 0 { + t.Fatalf("mapliteral: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { m := make(map[int]int) m[0] = 0 }) if n != 0 { - t.Fatalf("want 0 allocs, got %v", n) + t.Fatalf("no hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, 8) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with small hint: want 0 allocs, got %v", n) + } + n = testing.AllocsPerRun(1000, func() { + m := make(map[int]int, testNonEscapingMapVariable) + m[0] = 0 + }) + if n != 0 { + t.Fatalf("with variable hint: want 0 allocs, got %v", n) } + } func benchmarkMapAssignInt32(b *testing.B, n int) { @@ -643,12 +803,16 @@ func benchmarkMapAssignInt32(b *testing.B, n int) { } func benchmarkMapDeleteInt32(b *testing.B, n int) { - a := make(map[int32]int) - for i := 0; i < n*b.N; i++ { - a[int32(i)] = i - } + a := make(map[int32]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int32(j)] = j + } + b.StartTimer() + } delete(a, int32(i)) } } @@ -661,12 +825,16 @@ func benchmarkMapAssignInt64(b *testing.B, n int) { } func benchmarkMapDeleteInt64(b *testing.B, n int) { - a := make(map[int64]int) - for i := 0; i < n*b.N; i++ { - a[int64(i)] = i - } + a := make(map[int64]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := i; j < i+n; j++ { + a[int64(j)] = j + } + b.StartTimer() + } delete(a, int64(i)) } } @@ -684,17 +852,23 @@ func benchmarkMapAssignStr(b *testing.B, n int) { } func benchmarkMapDeleteStr(b *testing.B, n int) { - k := make([]string, n*b.N) - for i := 0; i < n*b.N; i++ { - k[i] = strconv.Itoa(i) - } - a := make(map[string]int) - for i := 0; i < n*b.N; i++ { - a[k[i]] = i + i2s := make([]string, n) + for i := 0; i < n; i++ { + i2s[i] = strconv.Itoa(i) } + a := make(map[string]int, n) b.ResetTimer() - for i := 0; i < n*b.N; i = i + n { - delete(a, k[i]) + k := 0 + for i := 0; i < b.N; i++ { + if len(a) == 0 { + b.StopTimer() + for j := 0; j < n; j++ { + a[i2s[j]] = j + } + k = i + b.StartTimer() + } + delete(a, i2s[i-k]) } 
} @@ -713,7 +887,7 @@ func BenchmarkMapAssign(b *testing.B) { } func BenchmarkMapDelete(b *testing.B) { - b.Run("Int32", runWith(benchmarkMapDeleteInt32, 1, 2, 4)) - b.Run("Int64", runWith(benchmarkMapDeleteInt64, 1, 2, 4)) - b.Run("Str", runWith(benchmarkMapDeleteStr, 1, 2, 4)) + b.Run("Int32", runWith(benchmarkMapDeleteInt32, 100, 1000, 10000)) + b.Run("Int64", runWith(benchmarkMapDeleteInt64, 100, 1000, 10000)) + b.Run("Str", runWith(benchmarkMapDeleteStr, 100, 1000, 10000)) } diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go index d54016f0ba9..3b8f71434b8 100644 --- a/libgo/go/runtime/mbarrier.go +++ b/libgo/go/runtime/mbarrier.go @@ -189,6 +189,8 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) { func writebarrierptr_prewrite1(dst *uintptr, src uintptr) { mp := acquirem() if mp.inwb || mp.dying > 0 { + // We explicitly allow write barriers in startpanic_m, + // since we're going down anyway. Ignore them here. releasem(mp) return } @@ -244,6 +246,10 @@ func writebarrierptr_prewrite(dst *uintptr, src uintptr) { // typedmemmove copies a value of type t to dst from src. // Must be nosplit, see #16026. +// +// TODO: Perfect for go:nosplitrec since we can't have a safe point +// anywhere in the bulk barrier or memmove. +// //go:nosplit func typedmemmove(typ *_type, dst, src unsafe.Pointer) { if typ.kind&kindNoPointers == 0 { @@ -265,8 +271,8 @@ func typedmemmove(typ *_type, dst, src unsafe.Pointer) { //go:linkname reflect_typedmemmove reflect.typedmemmove func reflect_typedmemmove(typ *_type, dst, src unsafe.Pointer) { if raceenabled { - raceWriteObjectPC(typ, dst, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) - raceReadObjectPC(typ, src, getcallerpc(unsafe.Pointer(&typ)), funcPC(reflect_typedmemmove)) + raceWriteObjectPC(typ, dst, getcallerpc(), funcPC(reflect_typedmemmove)) + raceReadObjectPC(typ, src, getcallerpc(), funcPC(reflect_typedmemmove)) } if msanenabled { msanwrite(dst, typ.size) @@ -310,8 +316,12 @@ func typedslicecopy(typ *_type, dst, src slice) int { dstp := dst.array srcp := src.array + // The compiler emits calls to typedslicecopy before + // instrumentation runs, so unlike the other copying and + // assignment operations, it's not instrumented in the calling + // code and needs its own instrumentation. if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&typ)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc) racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc) @@ -329,41 +339,13 @@ func typedslicecopy(typ *_type, dst, src slice) int { // compiler only emits calls to typedslicecopy for types with pointers, // and growslice and reflect_typedslicecopy check for pointers // before calling typedslicecopy. - if !writeBarrier.needed { - memmove(dstp, srcp, uintptr(n)*typ.size) - return n + size := uintptr(n) * typ.size + if writeBarrier.needed { + bulkBarrierPreWrite(uintptr(dstp), uintptr(srcp), size) } - - systemstack(func() { - if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { - // Overlap with src before dst. - // Copy backward, being careful not to move dstp/srcp - // out of the array they point into. - dstp = add(dstp, uintptr(n-1)*typ.size) - srcp = add(srcp, uintptr(n-1)*typ.size) - i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, -typ.size) - srcp = add(srcp, -typ.size) - } - } else { - // Copy forward, being careful not to move dstp/srcp - // out of the array they point into. 
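The loop being deleted in this typedslicecopy hunk chose a copy direction by hand so overlapping element ranges would not clobber themselves; the new code leaves that to memmove and does the barrier work in one bulk call beforehand. The direction rule on plain ints, purely illustrative (the built-in copy already handles overlap for slices):

package main

import "fmt"

// Direction rule for copying between overlapping regions: copy
// backward when the destination starts inside the source, forward
// otherwise. One backing array, plain ints.
func main() {
	a := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
	src, dst, n := 2, 4, 5 // copy a[2:7] onto a[4:9]
	if dst > src {
		for i := n - 1; i >= 0; i-- { // backward: don't clobber unread source
			a[dst+i] = a[src+i]
		}
	} else {
		for i := 0; i < n; i++ { // forward is safe
			a[dst+i] = a[src+i]
		}
	}
	fmt.Println(a) // [0 1 2 3 2 3 4 5 6 9]
}

Replacing the element-by-element typedmemmove loop with one bulkBarrierPreWrite plus one memmove is the point of the hunk: the barrier work and the byte copy are each done once.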
- i := 0 - for { - typedmemmove(typ, dstp, srcp) - if i++; i >= n { - break - } - dstp = add(dstp, typ.size) - srcp = add(srcp, typ.size) - } - } - }) + // See typedmemmove for a discussion of the race between the + // barrier and memmove. + memmove(dstp, srcp, size) return n } @@ -380,7 +362,7 @@ func reflect_typedslicecopy(elemType *_type, dst, src slice) int { size := uintptr(n) * elemType.size if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&elemType)) + callerpc := getcallerpc() pc := funcPC(reflect_typedslicecopy) racewriterangepc(dst.array, size, callerpc, pc) racereadrangepc(src.array, size, callerpc, pc) diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index d1a58202352..a775b57b033 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -463,11 +463,6 @@ func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, return } -// prefetch the bits. -func (h heapBits) prefetch() { - prefetchnta(uintptr(unsafe.Pointer((h.bitp)))) -} - // next returns the heapBits describing the next pointer-sized word in memory. // That is, if h describes address p, h.next() describes p+ptrSize. // Note that next does not modify h. The caller must record the result. @@ -542,12 +537,13 @@ func (h heapBits) setCheckmarked(size uintptr) { atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift)) } -// bulkBarrierPreWrite executes writebarrierptr_prewrite1 +// bulkBarrierPreWrite executes a write barrier // for every pointer slot in the memory range [src, src+size), // using pointer/scalar information from [dst, dst+size). // This executes the write barriers necessary before a memmove. // src, dst, and size must be pointer-aligned. // The range [dst, dst+size) must lie within a single object. +// It does not perform the actual writes. // // As a special case, src == 0 indicates that this is being used for a // memclr. 
bulkBarrierPreWrite will pass 0 for the src of each write @@ -593,12 +589,15 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { return } + buf := &getg().m.p.ptr().wbBuf h := heapBitsForAddr(dst) if src == 0 { for i := uintptr(0); i < size; i += sys.PtrSize { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -607,7 +606,9 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { if h.isPointer() { dstx := (*uintptr)(unsafe.Pointer(dst + i)) srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } h = h.next() } @@ -627,6 +628,7 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { bits = addb(bits, word/8) mask := uint8(1) << (word % 8) + buf := &getg().m.p.ptr().wbBuf for i := uintptr(0); i < size; i += sys.PtrSize { if mask == 0 { bits = addb(bits, 1) @@ -640,10 +642,14 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) { if *bits&mask != 0 { dstx := (*uintptr)(unsafe.Pointer(dst + i)) if src == 0 { - writebarrierptr_prewrite1(dstx, 0) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) + } } else { srcx := (*uintptr)(unsafe.Pointer(src + i)) - writebarrierptr_prewrite1(dstx, *srcx) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) + } } } mask <<= 1 diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go index 71a2f22114f..766cfd17523 100644 --- a/libgo/go/runtime/mcache.go +++ b/libgo/go/runtime/mcache.go @@ -96,7 +96,7 @@ func freemcache(c *mcache) { // Gets a span that has a free object in it and assigns it // to be the cached span for the given sizeclass. Returns this span. -func (c *mcache) refill(spc spanClass) *mspan { +func (c *mcache) refill(spc spanClass) { _g_ := getg() _g_.m.locks++ @@ -123,7 +123,6 @@ func (c *mcache) refill(spc spanClass) *mspan { c.alloc[spc] = s _g_.m.locks-- - return s } func (c *mcache) releaseAll() { diff --git a/libgo/go/runtime/mem_gccgo.go b/libgo/go/runtime/mem_gccgo.go index ea3e5ebab4e..a087945251f 100644 --- a/libgo/go/runtime/mem_gccgo.go +++ b/libgo/go/runtime/mem_gccgo.go @@ -13,9 +13,10 @@ import ( // Functions called by C code. //go:linkname sysAlloc runtime.sysAlloc +//go:linkname sysFree runtime.sysFree //extern mmap -func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer +func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) unsafe.Pointer //extern munmap func munmap(addr unsafe.Pointer, length uintptr) int32 @@ -40,6 +41,14 @@ func init() { } } +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uintptr) (unsafe.Pointer, int) { + p := sysMmap(addr, n, prot, flags, fd, off) + if uintptr(p) == _MAP_FAILED { + return nil, errno() + } + return p, 0 +} + // NOTE: vec must be just 1 byte long here. // Mincore returns ENOMEM if any of the pages are unmapped, // but we want to know that all of the pages are unmapped. 
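The new mmap wrapper above converts C's in-band failure convention, a MAP_FAILED sentinel plus errno, into an ordinary Go (value, error) pair at the lowest layer, so sysAlloc, sysReserve, and sysMap can simply test err. The same shape as a runnable Unix-only sketch built on the portable syscall wrapper; osAlloc is a made-up name, not a runtime function:

package main

import (
	"fmt"
	"syscall"
)

// One low-level function turns mmap's in-band failure (MAP_FAILED
// plus errno) into a (value, error) pair, and every caller just
// checks err.
func osAlloc(n int) ([]byte, error) {
	mem, err := syscall.Mmap(-1, 0, n,
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_ANON|syscall.MAP_PRIVATE)
	if err != nil {
		return nil, fmt.Errorf("mmap failed: %v", err)
	}
	return mem, nil
}

func main() {
	mem, err := osAlloc(1 << 20)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer syscall.Munmap(mem)
	mem[0] = 42 // the mapping is ordinary usable memory
	fmt.Printf("mapped %d bytes, first byte %d\n", len(mem), mem[0])
}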
@@ -75,31 +84,30 @@ func addrspace_free(v unsafe.Pointer, n uintptr) bool { return true } -func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) unsafe.Pointer { - p := mmap(v, n, prot, flags, fd, offset) +func mmap_fixed(v unsafe.Pointer, n uintptr, prot, flags, fd int32, offset uintptr) (unsafe.Pointer, int) { + p, err := mmap(v, n, prot, flags, fd, offset) // On some systems, mmap ignores v without // MAP_FIXED, so retry if the address space is free. if p != v && addrspace_free(v, n) { - if uintptr(p) != _MAP_FAILED { + if err == 0 { munmap(p, n) } - p = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) + p, err = mmap(v, n, prot, flags|_MAP_FIXED, fd, offset) } - return p + return p, err } // Don't split the stack as this method may be invoked without a valid G, which // prevents us from allocating more stack. //go:nosplit func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { - p := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED { - errval := errno() - if errval == _EACCES { + p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if err != 0 { + if err == _EACCES { print("runtime: mmap: access denied\n") exit(2) } - if errval == _EAGAIN { + if err == _EAGAIN { print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") exit(2) } @@ -225,9 +233,9 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { // if we can reserve at least 64K and check the assumption in SysMap. // Only user-mode Linux (UML) rejects these requests. if sys.PtrSize == 8 && uint64(n) > 1<<32 { - p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if p != v { - if uintptr(p) != _MAP_FAILED { + p, err := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if p != v || err != 0 { + if err == 0 { munmap(p, 64<<10) } return nil @@ -237,8 +245,8 @@ func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer { return v } - p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, mmapFD, 0) + if err != 0 { return nil } *reserved = true @@ -259,12 +267,12 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // to do this - we do not on other platforms. flags |= _MAP_FIXED } - p := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0) - if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM { + p, err := mmap_fixed(v, n, _PROT_READ|_PROT_WRITE, flags, mmapFD, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { - print("runtime: address space conflict: map(", v, ") = ", p, "\n") + if p != v || err != 0 { + print("runtime: address space conflict: map(", v, ") = ", p, " (err ", err, ")\n") throw("runtime: address space conflict") } return @@ -275,11 +283,11 @@ func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) { // So always unmap first even if it is already unmapped. 
munmap(v, n) } - p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0) - if uintptr(p) == _MAP_FAILED && errno() == _ENOMEM { + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, mmapFD, 0) + if err == _ENOMEM { throw("runtime: out of memory") } - if p != v { + if p != v || err != 0 { throw("runtime: cannot map pages in arena address space") } } diff --git a/libgo/go/runtime/memmove_test.go b/libgo/go/runtime/memmove_test.go index 74b8753b5f7..62de604e69c 100644 --- a/libgo/go/runtime/memmove_test.go +++ b/libgo/go/runtime/memmove_test.go @@ -9,6 +9,7 @@ import ( "encoding/binary" "fmt" "internal/race" + "internal/testenv" . "runtime" "testing" ) @@ -88,6 +89,10 @@ func TestMemmoveAlias(t *testing.T) { } func TestMemmoveLarge0x180000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") @@ -96,6 +101,10 @@ func TestMemmoveLarge0x180000(t *testing.T) { } func TestMemmoveOverlapLarge0x120000(t *testing.T) { + if testing.Short() && testenv.Builder() == "" { + t.Skip("-short") + } + t.Parallel() if race.Enabled { t.Skip("skipping large memmove test under race detector") diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go index 4353ee57569..19573d8b8d3 100644 --- a/libgo/go/runtime/mfinal.go +++ b/libgo/go/runtime/mfinal.go @@ -419,11 +419,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { return } -// Mark KeepAlive as noinline so that the current compiler will ensure -// that the argument is alive at the point of the function call. -// If it were inlined, it would disappear, and there would be nothing -// keeping the argument alive. Perhaps a future compiler will recognize -// runtime.KeepAlive specially and do something more efficient. +// Mark KeepAlive as noinline so that it is easily detectable as an intrinsic. //go:noinline // KeepAlive marks its argument as currently reachable. @@ -445,4 +441,11 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { // Without the KeepAlive call, the finalizer could run at the start of // syscall.Read, closing the file descriptor before syscall.Read makes // the actual system call. -func KeepAlive(interface{}) {} +func KeepAlive(x interface{}) { + // Introduce a use of x that the compiler can't eliminate. + // This makes sure x is alive on entry. We need x to be alive + // on entry for "defer runtime.KeepAlive(x)"; see issue 21402. + if cgoAlwaysFalse { + println(x) + } +} diff --git a/libgo/go/runtime/mfinal_test.go b/libgo/go/runtime/mfinal_test.go index 38c2623bb7b..2086e42ba33 100644 --- a/libgo/go/runtime/mfinal_test.go +++ b/libgo/go/runtime/mfinal_test.go @@ -254,3 +254,24 @@ var ( Foo2 = &Object2{} Foo1 = &Object1{} ) + +func TestDeferKeepAlive(t *testing.T) { + if *flagQuick { + t.Skip("-quick") + } + + // See issue 21402. + t.Parallel() + type T *int // needs to be a pointer base type to avoid tinyalloc and its never-finalized behavior. 
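The rewritten KeepAlive above exists for exactly the file-descriptor scenario its doc comment describes. Spelled out as a runnable Unix-only program; the File type here is hypothetical:

package main

import (
	"fmt"
	"runtime"
	"syscall"
)

// A wrapper owns an fd and a finalizer closes it. KeepAlive pins f
// until Read has returned, so the finalizer cannot close the
// descriptor mid-call.
type File struct{ d int }

func main() {
	p, err := syscall.Open("/etc/hosts", syscall.O_RDONLY, 0)
	if err != nil {
		fmt.Println(err)
		return
	}
	f := &File{d: p}
	runtime.SetFinalizer(f, func(f *File) { syscall.Close(f.d) })

	buf := make([]byte, 16)
	n, err := syscall.Read(f.d, buf)
	// Ensure f is not finalized (and f.d not closed) before Read returns.
	runtime.KeepAlive(f)
	fmt.Println(n, err)
}

Without the KeepAlive call, f could become unreachable as soon as f.d is loaded, letting the finalizer close the descriptor while Read is still using it; the cgoAlwaysFalse test in KeepAlive's new body is what forces the compiler to treat the argument as live until the call.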
+ x := new(T) + finRun := false + runtime.SetFinalizer(x, func(x *T) { + finRun = true + }) + defer runtime.KeepAlive(x) + runtime.GC() + time.Sleep(time.Second) + if finRun { + t.Errorf("finalizer ran prematurely") + } +} diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go index 31c4be86fe4..626f088d450 100644 --- a/libgo/go/runtime/mgc.go +++ b/libgo/go/runtime/mgc.go @@ -231,6 +231,24 @@ func setGCPercent(in int32) (out int32) { // Update pacing in response to gcpercent change. gcSetTriggerRatio(memstats.triggerRatio) unlock(&mheap_.lock) + + // If we just disabled GC, wait for any concurrent GC to + // finish so we always return with no GC running. + if in < 0 { + // Disable phase transitions. + lock(&work.sweepWaiters.lock) + if gcphase == _GCmark { + // GC is active. Wait until we reach sweeping. + gp := getg() + gp.schedlink = work.sweepWaiters.head + work.sweepWaiters.head.set(gp) + goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1) + } else { + // GC isn't active. + unlock(&work.sweepWaiters.lock) + } + } + return out } @@ -300,10 +318,10 @@ const ( // gcMarkWorkerFractionalMode indicates that a P is currently // running the "fractional" mark worker. The fractional worker - // is necessary when GOMAXPROCS*gcGoalUtilization is not an - // integer. The fractional worker should run until it is + // is necessary when GOMAXPROCS*gcBackgroundUtilization is not + // an integer. The fractional worker should run until it is // preempted and will be scheduled to pick up the fractional - // part of GOMAXPROCS*gcGoalUtilization. + // part of GOMAXPROCS*gcBackgroundUtilization. gcMarkWorkerFractionalMode // gcMarkWorkerIdleMode indicates that a P is running the mark @@ -397,23 +415,18 @@ type gcControllerState struct { assistBytesPerWork float64 // fractionalUtilizationGoal is the fraction of wall clock - // time that should be spent in the fractional mark worker. - // For example, if the overall mark utilization goal is 25% - // and GOMAXPROCS is 6, one P will be a dedicated mark worker - // and this will be set to 0.5 so that 50% of the time some P - // is in a fractional mark worker. This is computed at the - // beginning of each cycle. + // time that should be spent in the fractional mark worker on + // each P that isn't running a dedicated worker. + // + // For example, if the utilization goal is 25% and there are + // no dedicated workers, this will be 0.25. If there goal is + // 25%, there is one dedicated worker, and GOMAXPROCS is 5, + // this will be 0.05 to make up the missing 5%. + // + // If this is zero, no fractional workers are needed. fractionalUtilizationGoal float64 _ [sys.CacheLineSize]byte - - // fractionalMarkWorkersNeeded is the number of fractional - // mark workers that need to be started. This is either 0 or - // 1. This is potentially updated atomically at every - // scheduling point (hence it gets its own cache line). - fractionalMarkWorkersNeeded int64 - - _ [sys.CacheLineSize]byte } // startCycle resets the GC controller's state and computes estimates @@ -454,23 +467,33 @@ func (c *gcControllerState) startCycle() { memstats.next_gc = memstats.heap_live + 1024*1024 } - // Compute the total mark utilization goal and divide it among - // dedicated and fractional workers. 
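The replacement arithmetic just below rounds GOMAXPROCS*gcBackgroundUtilization to whole dedicated workers and falls back to a per-P fractional goal when rounding strays more than 30% from the target. It can be checked in isolation; the constants are copied out of the diff, not imported from the runtime:

package main

import "fmt"

// Restatement of the dedicated/fractional split: round to whole
// dedicated workers; if rounding is more than 30% off the goal,
// shave a dedicated worker if needed and make up the difference
// with a per-P fractional goal.
func markWorkers(gomaxprocs int) (dedicated int, fractionalGoal float64) {
	const gcBackgroundUtilization = 0.25
	total := float64(gomaxprocs) * gcBackgroundUtilization
	dedicated = int(total + 0.5)
	utilError := float64(dedicated)/total - 1
	const maxUtilError = 0.3
	if utilError < -maxUtilError || utilError > maxUtilError {
		if float64(dedicated) > total {
			dedicated-- // rounding up overshot; too many dedicated workers
		}
		fractionalGoal = (total - float64(dedicated)) / float64(gomaxprocs)
	}
	return
}

func main() {
	for _, n := range []int{1, 2, 3, 4, 5, 6, 8} {
		d, f := markWorkers(n)
		fmt.Printf("GOMAXPROCS=%d: dedicated=%d fractionalGoal=%.3f\n", n, d, f)
	}
}

For GOMAXPROCS of 1 through 3 and for 6 this prints a nonzero fractional goal, matching the cases the comment below calls out.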
- totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization - c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal) - c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded) - if c.fractionalUtilizationGoal > 0 { - c.fractionalMarkWorkersNeeded = 1 + // Compute the background mark utilization goal. In general, + // this may not come out exactly. We round the number of + // dedicated workers so that the utilization is closest to + // 25%. For small GOMAXPROCS, this would introduce too much + // error, so we add fractional workers in that case. + totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization + c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5) + utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1 + const maxUtilError = 0.3 + if utilError < -maxUtilError || utilError > maxUtilError { + // Rounding put us more than 30% off our goal. With + // gcBackgroundUtilization of 25%, this happens for + // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional + // workers to compensate. + if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal { + // Too many dedicated workers. + c.dedicatedMarkWorkersNeeded-- + } + c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs) } else { - c.fractionalMarkWorkersNeeded = 0 + c.fractionalUtilizationGoal = 0 } // Clear per-P state - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { p.gcAssistTime = 0 + p.gcFractionalMarkTime = 0 } // Compute initial values for controls that are updated @@ -483,7 +506,7 @@ func (c *gcControllerState) startCycle() { work.initialHeapLive>>20, "->", memstats.next_gc>>20, " MB)", " workers=", c.dedicatedMarkWorkersNeeded, - "+", c.fractionalMarkWorkersNeeded, "\n") + "+", c.fractionalUtilizationGoal, "\n") } } @@ -496,47 +519,73 @@ func (c *gcControllerState) startCycle() { // is when assists are enabled and the necessary statistics are // available). func (c *gcControllerState) revise() { - // Compute the expected scan work remaining. + gcpercent := gcpercent + if gcpercent < 0 { + // If GC is disabled but we're running a forced GC, + // act like GOGC is huge for the below calculations. + gcpercent = 100000 + } + live := atomic.Load64(&memstats.heap_live) + + var heapGoal, scanWorkExpected int64 + if live <= memstats.next_gc { + // We're under the soft goal. Pace GC to complete at + // next_gc assuming the heap is in steady-state. + heapGoal = int64(memstats.next_gc) + + // Compute the expected scan work remaining. + // + // This is estimated based on the expected + // steady-state scannable heap. For example, with + // GOGC=100, only half of the scannable heap is + // expected to be live, so that's what we target. + // + // (This is a float calculation to avoid overflowing on + // 100*heap_scan.) + scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent)) + } else { + // We're past the soft goal. Pace GC so that in the + // worst case it will complete by the hard goal. + const maxOvershoot = 1.1 + heapGoal = int64(float64(memstats.next_gc) * maxOvershoot) + + // Compute the upper bound on the scan work remaining. + scanWorkExpected = int64(memstats.heap_scan) + } + + // Compute the remaining scan work estimate. // // Note that we currently count allocations during GC as both // scannable heap (heap_scan) and scan work completed - // (scanWork), so this difference won't be changed by - // allocations during GC. 
- // - // This particular estimate is a strict upper bound on the - // possible remaining scan work for the current heap. - // You might consider dividing this by 2 (or by - // (100+GOGC)/100) to counter this over-estimation, but - // benchmarks show that this has almost no effect on mean - // mutator utilization, heap size, or assist time and it - // introduces the danger of under-estimating and letting the - // mutator outpace the garbage collector. - scanWorkExpected := int64(memstats.heap_scan) - c.scanWork - if scanWorkExpected < 1000 { + // (scanWork), so allocation will change this difference + // slowly in the soft regime and not at all in the hard + // regime. + scanWorkRemaining := scanWorkExpected - c.scanWork + if scanWorkRemaining < 1000 { // We set a somewhat arbitrary lower bound on // remaining scan work since if we aim a little high, // we can miss by a little. // // We *do* need to enforce that this is at least 1, // since marking is racy and double-scanning objects - // may legitimately make the expected scan work - // negative. - scanWorkExpected = 1000 + // may legitimately make the remaining scan work + // negative, even in the hard goal regime. + scanWorkRemaining = 1000 } // Compute the heap distance remaining. - heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live)) - if heapDistance <= 0 { + heapRemaining := heapGoal - int64(live) + if heapRemaining <= 0 { // This shouldn't happen, but if it does, avoid // dividing by zero or setting the assist negative. - heapDistance = 1 + heapRemaining = 1 } // Compute the mutator assist ratio so by the time the mutator // allocates the remaining heap bytes up to next_gc, it will // have done (or stolen) the remaining amount of scan work. - c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance) - c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected) + c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining) + c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining) } // endCycle computes the trigger ratio for the next cycle. @@ -570,7 +619,7 @@ func (c *gcControllerState) endCycle() float64 { assistDuration := nanotime() - c.markStartTime // Assume background mark hit its utilization goal. - utilization := gcGoalUtilization + utilization := gcBackgroundUtilization // Add assist utilization; avoid divide by zero. if assistDuration > 0 { utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs)) @@ -689,51 +738,20 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { // This P is now dedicated to marking until the end of // the concurrent mark phase. _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode + } else if c.fractionalUtilizationGoal == 0 { + // No need for fractional workers. + return nil } else { - if !decIfPositive(&c.fractionalMarkWorkersNeeded) { - // No more workers are need right now. - return nil - } - - // This P has picked the token for the fractional worker. - // Is the GC currently under or at the utilization goal? - // If so, do more work. - // - // We used to check whether doing one time slice of work - // would remain under the utilization goal, but that has the - // effect of delaying work until the mutator has run for - // enough time slices to pay for the work. During those time - // slices, write barriers are enabled, so the mutator is running slower.
- // Now instead we do the work whenever we're under or at the - // utilization work and pay for it by letting the mutator run later. - // This doesn't change the overall utilization averages, but it - // front loads the GC work so that the GC finishes earlier and - // write barriers can be turned off sooner, effectively giving - // the mutator a faster machine. - // - // The old, slower behavior can be restored by setting - // gcForcePreemptNS = forcePreemptNS. - const gcForcePreemptNS = 0 - - // TODO(austin): We could fast path this and basically - // eliminate contention on c.fractionalMarkWorkersNeeded by - // precomputing the minimum time at which it's worth - // next scheduling the fractional worker. Then Ps - // don't have to fight in the window where we've - // passed that deadline and no one has started the - // worker yet. + // Is this P behind on the fractional utilization + // goal? // - // TODO(austin): Shorter preemption interval for mark - // worker to improve fairness and give this - // finer-grained control over schedule? - now := nanotime() - gcController.markStartTime - then := now + gcForcePreemptNS - timeUsed := c.fractionalMarkTime + gcForcePreemptNS - if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal { - // Nope, we'd overshoot the utilization goal - atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1) + // This should be kept in sync with pollFractionalWorkerExit. + delta := nanotime() - gcController.markStartTime + if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal { + // Nope. No need to run a fractional worker. return nil } + // Run a fractional worker. _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode } @@ -746,6 +764,24 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { return gp } +// pollFractionalWorkerExit returns true if a fractional mark worker +// should self-preempt. It assumes it is called from the fractional +// worker. +func pollFractionalWorkerExit() bool { + // This should be kept in sync with the fractional worker + // scheduler logic in findRunnableGCWorker. + now := nanotime() + delta := now - gcController.markStartTime + if delta <= 0 { + return true + } + p := getg().m.p.ptr() + selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime) + // Add some slack to the utilization goal so that the + // fractional worker isn't behind again the instant it exits. + return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal +} + // gcSetTriggerRatio sets the trigger ratio and updates everything // derived from it: the absolute trigger, the heap goal, mark pacing, // and sweep pacing. @@ -860,9 +896,22 @@ func gcSetTriggerRatio(triggerRatio float64) { } } -// gcGoalUtilization is the goal CPU utilization for background +// gcGoalUtilization is the goal CPU utilization for // marking as a fraction of GOMAXPROCS. -const gcGoalUtilization = 0.25 +const gcGoalUtilization = 0.30 + +// gcBackgroundUtilization is the fixed CPU utilization for background +// marking. It must be <= gcGoalUtilization. The difference between +// gcGoalUtilization and gcBackgroundUtilization will be made up by +// mark assists. The scheduler will aim to use within 50% of this +// goal. +// +// Setting this to < gcGoalUtilization avoids saturating the trigger +// feedback controller when there are no assists, which allows it to +// better control CPU and heap growth. 
However, the larger the gap, +// the more mutator assists are expected to happen, which impact +// mutator latency. +const gcBackgroundUtilization = 0.25 // gcCreditSlack is the amount of scan work credit that can can // accumulate locally before updating gcController.scanWork and, @@ -1159,7 +1208,7 @@ func (t gcTrigger) test() bool { if t.kind == gcTriggerAlways { return true } - if gcphase != _GCoff || gcpercent < 0 { + if gcphase != _GCoff { return false } switch t.kind { @@ -1170,6 +1219,9 @@ func (t gcTrigger) test() bool { // own write. return memstats.heap_live >= memstats.gc_trigger case gcTriggerTime: + if gcpercent < 0 { + return false + } lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime)) return lastgc != 0 && t.now-lastgc > forcegcperiod case gcTriggerCycle: @@ -1236,7 +1288,7 @@ func gcStart(mode gcMode, trigger gcTrigger) { } } - // Ok, we're doing it! Stop everybody else + // Ok, we're doing it! Stop everybody else semacquire(&worldsema) if trace.enabled { @@ -1249,7 +1301,12 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcResetMarkState() - work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs + work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs + if work.stwprocs > ncpu { + // This is used to compute CPU time of the STW phases, + // so it can't be more than ncpu, even if GOMAXPROCS is. + work.stwprocs = ncpu + } work.heap0 = atomic.Load64(&memstats.heap_live) work.pauseNS = 0 work.mode = mode @@ -1257,6 +1314,9 @@ func gcStart(mode gcMode, trigger gcTrigger) { now := nanotime() work.tSweepTerm = now work.pauseStart = now + if trace.enabled { + traceGCSTWStart(1) + } systemstack(stopTheWorldWithSema) // Finish sweep before we start concurrent scan. systemstack(func() { @@ -1309,11 +1369,17 @@ func gcStart(mode gcMode, trigger gcTrigger) { gcController.markStartTime = now // Concurrent mark. - systemstack(startTheWorldWithSema) - now = nanotime() + systemstack(func() { + now = startTheWorldWithSema(trace.enabled) + }) work.pauseNS += now - work.pauseStart work.tMark = now } else { + if trace.enabled { + // Switch to mark termination STW. + traceGCSTWDone() + traceGCSTWStart(0) + } t := nanotime() work.tMark, work.tMarkTerm = t, t work.heapGoal = work.heap0 @@ -1356,7 +1422,8 @@ top: // TODO(austin): Should dedicated workers keep an eye on this // and exit gcDrain promptly? atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff) + prevFractionalGoal := gcController.fractionalUtilizationGoal + gcController.fractionalUtilizationGoal = 0 if !gcBlackenPromptly { // Transition from mark 1 to mark 2. @@ -1383,6 +1450,7 @@ top: // workers have exited their loop so we can // start new mark 2 workers. forEachP(func(_p_ *p) { + wbBufFlush1(_p_) _p_.gcw.dispose() }) }) @@ -1399,7 +1467,7 @@ top: // Now we can start up mark 2 workers. atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff) + gcController.fractionalUtilizationGoal = prevFractionalGoal incnwait := atomic.Xadd(&work.nwait, +1) if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { @@ -1414,6 +1482,9 @@ top: work.tMarkTerm = now work.pauseStart = now getg().m.preemptoff = "gcing" + if trace.enabled { + traceGCSTWStart(0) + } systemstack(stopTheWorldWithSema) // The gcphase is _GCmark, it will transition to _GCmarktermination // below. 
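Alongside the STW tracing changes, gcTrigger.test above was reworked so that a negative gcpercent (GOGC=off) is checked per trigger kind: the periodic time trigger is disabled explicitly, while a forced runtime.GC (gcTriggerCycle) still runs. The two gated predicates restated with plain values; forcegcperiod is two minutes in the real runtime:

package main

import (
	"fmt"
	"time"
)

// The heap trigger compares live heap against the pacer's trigger;
// the time trigger fires after forcegcperiod unless GC is disabled,
// which is the new gcpercent < 0 check in the hunk above.
const forcegcperiod = 2 * time.Minute

func heapTriggered(heapLive, gcTrigger uint64) bool {
	return heapLive >= gcTrigger
}

func timeTriggered(now, lastGC time.Time, gcDisabled bool) bool {
	if gcDisabled {
		return false
	}
	return !lastGC.IsZero() && now.Sub(lastGC) > forcegcperiod
}

func main() {
	fmt.Println(heapTriggered(9<<20, 8<<20))                                      // true
	fmt.Println(timeTriggered(time.Now(), time.Now().Add(-3*time.Minute), false)) // true
	fmt.Println(timeTriggered(time.Now(), time.Now().Add(-3*time.Minute), true))  // false
}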
The important thing is that the wb remains active until @@ -1574,7 +1645,7 @@ func gcMarkTermination(nextTriggerRatio float64) { // so events don't leak into the wrong cycle. mProf_NextCycle() - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(true) }) // Flush the heap profile so we can start a new cycle next GC. // This is relatively expensive, so we don't do it with the @@ -1645,10 +1716,7 @@ func gcMarkTermination(nextTriggerRatio float64) { func gcBgMarkStartWorkers() { // Background marking is performed by per-P G's. Ensure that // each P has a background GC G. - for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { if p.gcBgMarkWorker == 0 { expectSystemGoroutine() go gcBgMarkWorker(p) @@ -1751,6 +1819,7 @@ func gcBgMarkWorker(_p_ *p) { } startTime := nanotime() + _p_.gcMarkWorkerStartTime = startTime decnwait := atomic.Xadd(&work.nwait, -1) if decnwait == work.nproc { @@ -1792,7 +1861,7 @@ func gcBgMarkWorker(_p_ *p) { // without preemption. gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit) case gcMarkWorkerFractionalMode: - gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit) + gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) case gcMarkWorkerIdleMode: gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit) } @@ -1817,7 +1886,7 @@ func gcBgMarkWorker(_p_ *p) { atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1) case gcMarkWorkerFractionalMode: atomic.Xaddint64(&gcController.fractionalMarkTime, duration) - atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1) + atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration) case gcMarkWorkerIdleMode: atomic.Xaddint64(&gcController.idleMarkTime, duration) } @@ -1915,10 +1984,6 @@ func gcMark(start_time int64) { work.helperDrainBlock = true } - if trace.enabled { - traceGCScanStart() - } - if work.nproc > 1 { noteclear(&work.alldone) helpgc(int32(work.nproc)) @@ -1952,8 +2017,8 @@ func gcMark(start_time int64) { // Double-check that all gcWork caches are empty. This should // be ensured by mark 2 before we enter mark termination. - for i := 0; i < int(gomaxprocs); i++ { - gcw := &allp[i].gcw + for _, p := range allp { + gcw := &p.gcw if !gcw.empty() { throw("P has cached GC work at end of mark termination") } @@ -1962,10 +2027,6 @@ func gcMark(start_time int64) { } } - if trace.enabled { - traceGCScanDone() - } - cachestats() // Update the marked heap stat. @@ -2093,18 +2154,19 @@ func clearpools() { unlock(&sched.deferlock) } -// Timing - -//go:nowritebarrier +// gchelper runs mark termination tasks on Ps other than the P +// coordinating mark termination. +// +// The caller is responsible for ensuring that this has a P to run on, +// even though it's running during STW. Because of this, it's allowed +// to have write barriers. +// +//go:yeswritebarrierrec func gchelper() { _g_ := getg() _g_.m.traceback = 2 gchelperstart() - if trace.enabled { - traceGCScanStart() - } - // Parallel mark over GC roots and heap if gcphase == _GCmarktermination { gcw := &_g_.m.p.ptr().gcw @@ -2116,10 +2178,6 @@ func gchelper() { gcw.dispose() } - if trace.enabled { - traceGCScanDone() - } - nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone if atomic.Xadd(&work.ndone, +1) == nproc-1 { notewakeup(&work.alldone) @@ -2138,6 +2196,8 @@ func gchelperstart() { } } +// Timing + // itoaDiv formats val/(10**dec) into buf. 
func itoaDiv(buf []byte, val uint64, dec int) []byte { i := len(buf) - 1 diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go index c1fa1547adc..107a70a7898 100644 --- a/libgo/go/runtime/mgc_gccgo.go +++ b/libgo/go/runtime/mgc_gccgo.go @@ -6,7 +6,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/sys" + "unsafe" +) // gcRoot is a single GC root: a variable plus a ptrmask. type gcRoot struct { @@ -85,3 +88,21 @@ func checkPreempt() { gp.scanningself = false mcall(gopreempt_m) } + +// gcWriteBarrier implements a write barrier. This is implemented in +// assembly in the gc library, but there is no special advantage to +// doing so with gccgo. +//go:nosplit +//go:nowritebarrier +func gcWriteBarrier(dst *uintptr, src uintptr) { + buf := &getg().m.p.ptr().wbBuf + next := buf.next + np := next + 2*sys.PtrSize + buf.next = np + *(*uintptr)(unsafe.Pointer(next)) = src + *(*uintptr)(unsafe.Pointer(next + sys.PtrSize)) = *dst + if np >= buf.end { + wbBufFlush(dst, src) + } + *dst = src +} diff --git a/libgo/go/runtime/mgclarge.go b/libgo/go/runtime/mgclarge.go index 757e88d1d9d..fe437bf5e84 100644 --- a/libgo/go/runtime/mgclarge.go +++ b/libgo/go/runtime/mgclarge.go @@ -164,11 +164,10 @@ func (root *mTreap) insert(span *mspan) { } } -func (root *mTreap) removeNode(t *treapNode) *mspan { +func (root *mTreap) removeNode(t *treapNode) { if t.spanKey.npages != t.npagesKey { throw("span and treap node npages do not match") } - result := t.spanKey // Rotate t down to be leaf of tree for removal, respecting priorities. for t.right != nil || t.left != nil { @@ -192,7 +191,6 @@ func (root *mTreap) removeNode(t *treapNode) *mspan { t.spanKey = nil t.npagesKey = 0 mheap_.treapalloc.free(unsafe.Pointer(t)) - return result } // remove searches for, finds, removes from the treap, and returns the smallest diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go index 998a830caa8..7297fcb6d1a 100644 --- a/libgo/go/runtime/mgcmark.go +++ b/libgo/go/runtime/mgcmark.go @@ -34,13 +34,13 @@ const ( // span base. maxObletBytes = 128 << 10 - // idleCheckThreshold specifies how many units of work to do - // between run queue checks in an idle worker. Assuming a scan + // drainCheckThreshold specifies how many units of work to do + // between self-preemption checks in gcDrain. Assuming a scan // rate of 1 MB/ms, this is ~100 µs. Lower values have higher // overhead in the scan loop (the scheduler check may perform // a syscall, so its overhead is nontrivial). Higher values // make the system less responsive to incoming work. - idleCheckThreshold = 100000 + drainCheckThreshold = 100000 ) // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and @@ -717,6 +717,7 @@ const ( gcDrainNoBlock gcDrainFlushBgCredit gcDrainIdle + gcDrainFractional // gcDrainBlock means neither gcDrainUntilPreempt or // gcDrainNoBlock. It is the default, but callers should use @@ -733,6 +734,10 @@ const ( // If flags&gcDrainIdle != 0, gcDrain returns when there is other work // to do. This implies gcDrainNoBlock. // +// If flags&gcDrainFractional != 0, gcDrain self-preempts when +// pollFractionalWorkerExit() returns true. This implies +// gcDrainNoBlock. +// // If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is // unable to get more work. Otherwise, it will block until all // blocking calls are blocked in gcDrain. 
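gcWriteBarrier above is gccgo's entry point into the buffered write barrier that the bulkBarrierPreWrite changes also target: append two words per recorded write with a plain pointer bump, and take the flush slow path only when the buffer fills. The shape in miniature, with a counting flush instead of real marking; this wbBuf is a local stand-in, not the runtime type:

package main

import "fmt"

// Fixed per-P buffer of recorded pointer writes; the fast path is a
// bounds-free append, the slow path drains the whole buffer.
type wbBuf struct {
	buf  [512]uintptr
	next int // index of the next free slot
}

func (b *wbBuf) put(oldp, newp uintptr) {
	b.buf[b.next] = newp
	b.buf[b.next+1] = oldp
	b.next += 2
	if b.next == len(b.buf) {
		b.flush()
	}
}

func (b *wbBuf) flush() {
	fmt.Printf("flushing %d words\n", b.next)
	b.next = 0 // a real flush would grey the recorded pointers first
}

func main() {
	var b wbBuf
	for i := uintptr(0); i < 600; i++ {
		b.put(i, i+1)
	}
	fmt.Printf("%d words still buffered\n", b.next)
}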
@@ -749,14 +754,24 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gp := getg().m.curg preemptible := flags&gcDrainUntilPreempt != 0 - blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0 + blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainFractional|gcDrainNoBlock) == 0 flushBgCredit := flags&gcDrainFlushBgCredit != 0 idle := flags&gcDrainIdle != 0 initScanWork := gcw.scanWork - // idleCheck is the scan work at which to perform the next - // idle check with the scheduler. - idleCheck := initScanWork + idleCheckThreshold + + // checkWork is the scan work before performing the next + // self-preempt check. + checkWork := int64(1<<63 - 1) + var check func() bool + if flags&(gcDrainIdle|gcDrainFractional) != 0 { + checkWork = initScanWork + drainCheckThreshold + if idle { + check = pollWork + } else if flags&gcDrainFractional != 0 { + check = pollFractionalWorkerExit + } + } // Drain root marking jobs. if work.markrootNext < work.markrootJobs { @@ -766,7 +781,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { break } markroot(gcw, job) - if idle && pollWork() { + if check != nil && check() { goto done } } @@ -807,12 +822,12 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gcFlushBgCredit(gcw.scanWork - initScanWork) initScanWork = 0 } - idleCheck -= gcw.scanWork + checkWork -= gcw.scanWork gcw.scanWork = 0 - if idle && idleCheck <= 0 { - idleCheck += idleCheckThreshold - if pollWork() { + if checkWork <= 0 { + checkWork += drainCheckThreshold + if check != nil && check() { break } } @@ -1091,6 +1106,9 @@ func shade(b uintptr) { // obj is the start of an object with mark mbits. // If it isn't already marked, mark it and enqueue into gcw. // base and off are for debugging only and could be removed. +// +// See also wbBufFlush1, which partially duplicates this logic. +// //go:nowritebarrierrec func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr, forStack bool) { // obj should be start of allocation, and so must be at least pointer-aligned. @@ -1249,10 +1267,7 @@ func gcmarknewobject(obj, size, scanSize uintptr) { // // The world must be stopped. func gcMarkTinyAllocs() { - for _, p := range &allp { - if p == nil || p.status == _Pdead { - break - } + for _, p := range allp { c := p.mcache if c == nil || c.tiny == 0 { continue diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go index 461679b9343..c6634fc78ca 100644 --- a/libgo/go/runtime/mgcwork.go +++ b/libgo/go/runtime/mgcwork.go @@ -85,6 +85,13 @@ type gcWork struct { scanWork int64 } +// Most of the methods of gcWork are go:nowritebarrierrec because the +// write barrier itself can invoke gcWork methods but the methods are +// not generally re-entrant. Hence, if a gcWork method invoked the +// write barrier while the gcWork was in an inconsistent state, and +// the write barrier in turn invoked a gcWork method, it could +// permanently corrupt the gcWork. + func (w *gcWork) init() { w.wbuf1 = getempty() wbuf2 := trygetfull() @@ -96,7 +103,7 @@ func (w *gcWork) init() { // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { flushed := false wbuf := w.wbuf1 @@ -129,7 +136,7 @@ func (w *gcWork) put(obj uintptr) { // putFast does a put and returns true if it can be done quickly // otherwise it returns false and the caller needs to call put. 
-//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { wbuf := w.wbuf1 if wbuf == nil { @@ -143,12 +150,45 @@ func (w *gcWork) putFast(obj uintptr) bool { return true } +// putBatch performs a put on every pointer in obj. See put for +// constraints on these pointers. +// +//go:nowritebarrierrec +func (w *gcWork) putBatch(obj []uintptr) { + if len(obj) == 0 { + return + } + + flushed := false + wbuf := w.wbuf1 + if wbuf == nil { + w.init() + wbuf = w.wbuf1 + } + + for len(obj) > 0 { + for wbuf.nobj == len(wbuf.obj) { + putfull(wbuf) + w.wbuf1, w.wbuf2 = w.wbuf2, getempty() + wbuf = w.wbuf1 + flushed = true + } + n := copy(wbuf.obj[wbuf.nobj:], obj) + wbuf.nobj += n + obj = obj[n:] + } + + if flushed && gcphase == _GCmark { + gcController.enlistWorker() + } +} + // tryGet dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global // queue, tryGet returns 0. Note that there may still be pointers in // other gcWork instances or other caches. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGet() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -177,7 +217,7 @@ func (w *gcWork) tryGet() uintptr { // tryGetFast dequeues a pointer for the garbage collector to trace // if one is readily available. Otherwise it returns 0 and // the caller is expected to call tryGet(). -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) tryGetFast() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -194,7 +234,7 @@ func (w *gcWork) tryGetFast() uintptr { // get dequeues a pointer for the garbage collector to trace, blocking // if necessary to ensure all pointers from all queues and caches have // been retrieved. get returns 0 if there are no pointers remaining. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) get() uintptr { wbuf := w.wbuf1 if wbuf == nil { @@ -228,7 +268,7 @@ func (w *gcWork) get() uintptr { // GC can inspect them. This helps reduce the mutator's // ability to hide pointers during the concurrent mark phase. // -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) dispose() { if wbuf := w.wbuf1; wbuf != nil { if wbuf.nobj == 0 { @@ -262,7 +302,7 @@ func (w *gcWork) dispose() { // balance moves some work that's cached in this gcWork back on the // global queue. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) balance() { if w.wbuf1 == nil { return @@ -282,7 +322,7 @@ func (w *gcWork) balance() { } // empty returns true if w has no mark work available. -//go:nowritebarrier +//go:nowritebarrierrec func (w *gcWork) empty() bool { return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) } diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go index 8749f971065..d971bfee4df 100644 --- a/libgo/go/runtime/mheap.go +++ b/libgo/go/runtime/mheap.go @@ -56,6 +56,12 @@ type mheap struct { // Internal pages map to an arbitrary span. // For pages that have never been allocated, spans entries are nil. // + // Modifications are protected by mheap.lock. Reads can be + // performed without locking, but ONLY from indexes that are + // known to contain in-use or stack spans. This means there + // must not be a safe-point between establishing that an + // address is live and looking it up in the spans array. + // // This is backed by a reserved region of the address space so // it can grow without moving. The memory up to len(spans) is // mapped. cap(spans) indicates the total reserved memory. 
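Stepping back to the gcWork.putBatch method added above: it copies the incoming batch into fixed-size workbufs a chunk at a time, flushing each buffer the moment it fills. A self-contained sketch of that chunked-copy loop; workbuf here is a toy four-slot buffer, and flush stands in for handing a full workbuf to the global queue (the real method swaps its two cached buffers rather than clearing one in place):

package main

import "fmt"

// workbuf is a tiny fixed-capacity stand-in for the runtime's workbuf;
// four slots so the demo actually overflows.
type workbuf struct {
	nobj int
	obj  [4]uintptr
}

// putBatch copies obj into wbuf in chunks, calling flush whenever the
// buffer is full, mirroring the loop in gcWork.putBatch.
func putBatch(wbuf *workbuf, obj []uintptr, flush func(*workbuf)) {
	for len(obj) > 0 {
		for wbuf.nobj == len(wbuf.obj) {
			flush(wbuf) // hand the full buffer off and start empty
			wbuf.nobj = 0
		}
		n := copy(wbuf.obj[wbuf.nobj:], obj)
		wbuf.nobj += n
		obj = obj[n:]
	}
}

func main() {
	var buf workbuf
	putBatch(&buf, []uintptr{1, 2, 3, 4, 5, 6, 7, 8, 9}, func(w *workbuf) {
		fmt.Println("flush:", w.obj[:w.nobj])
	})
	fmt.Println("residual:", buf.obj[:buf.nobj])
}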
@@ -154,6 +160,8 @@ type mheap struct { specialfinalizeralloc fixalloc // allocator for specialfinalizer* specialprofilealloc fixalloc // allocator for specialprofile* speciallock mutex // lock for special record allocators. + + unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF } var mheap_ mheap @@ -311,6 +319,17 @@ func (s *mspan) layout() (size, n, total uintptr) { return } +// recordspan adds a newly allocated span to h.allspans. +// +// This only happens the first time a span is allocated from +// mheap.spanalloc (it is not called when a span is reused). +// +// Write barriers are disallowed here because it can be called from +// gcWork when allocating new workbufs. However, because it's an +// indirect call from the fixalloc initializer, the compiler can't see +// this. +// +//go:nowritebarrierrec func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { h := (*mheap)(vh) s := (*mspan)(p) @@ -320,8 +339,8 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { n = cap(h.allspans) * 3 / 2 } var new []*mspan - sp := (*slice)(unsafe.Pointer(&new)) - sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys) + sp := (*notInHeapSlice)(unsafe.Pointer(&new)) + sp.array = (*notInHeap)(sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)) if sp.array == nil { throw("runtime: cannot allocate memory") } @@ -331,12 +350,13 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { copy(new, h.allspans) } oldAllspans := h.allspans - h.allspans = new + *(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new)) if len(oldAllspans) != 0 { sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys) } } - h.allspans = append(h.allspans, s) + h.allspans = h.allspans[:len(h.allspans)+1] + h.allspans[len(h.allspans)-1] = s } // A spanClass represents the size class and noscan-ness of a span. @@ -854,7 +874,7 @@ HaveSpan: // Large spans have a minimum size of 1MByte. The maximum number of large spans to support // 1TBytes is 1 million, experimentation using random sizes indicates that the depth of // the tree is less than 2x that of a perfectly balanced tree. 1TByte can be referenced -// by a perfectly balanced tree with a a depth of 20. Twice that is an acceptable 40. +// by a perfectly balanced tree with a depth of 20. Twice that is an acceptable 40. func (h *mheap) isLargeSpan(npages uintptr) bool { return npages >= uintptr(len(h.free)) } @@ -1120,34 +1140,35 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr { var sumreleased uintptr for s := list.first; s != nil; s = s.next { - if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // We can only release pages in - // physPageSize blocks, so round start - // and end in. (Otherwise, madvise - // will round them *out* and release - // more memory than we want.) - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - if end <= start { - // start and end don't span a - // whole physical page.
- continue - } - } - len := end - start - - released := len - (s.npreleased << _PageShift) - if physPageSize > _PageSize && released == 0 { + if (now-uint64(s.unusedsince)) <= limit || s.npreleased == s.npages { + continue + } + start := s.base() + end := start + s.npages<<_PageShift + if physPageSize > _PageSize { + // We can only release pages in + // physPageSize blocks, so round start + // and end in. (Otherwise, madvise + // will round them *out* and release + // more memory than we want.) + start = (start + physPageSize - 1) &^ (physPageSize - 1) + end &^= physPageSize - 1 + if end <= start { + // start and end don't span a + // whole physical page. continue } - memstats.heap_released += uint64(released) - sumreleased += released - s.npreleased = len >> _PageShift - sysUnused(unsafe.Pointer(start), len) } + len := end - start + + released := len - (s.npreleased << _PageShift) + if physPageSize > _PageSize && released == 0 { + continue + } + memstats.heap_released += uint64(released) + sumreleased += released + s.npreleased = len >> _PageShift + sysUnused(unsafe.Pointer(start), len) } return sumreleased } diff --git a/libgo/go/runtime/mksizeclasses.go b/libgo/go/runtime/mksizeclasses.go index 0cb2b33a8cd..b146dbcd6c9 100644 --- a/libgo/go/runtime/mksizeclasses.go +++ b/libgo/go/runtime/mksizeclasses.go @@ -24,8 +24,8 @@ // In practice, only one of the wastes comes into play for a // given size (sizes < 512 waste mainly on the round-up, // sizes > 512 waste mainly on the page chopping). -// -// TODO(rsc): Compute max waste for any given size. +// For really small sizes, alignment constraints force the +// overhead higher. package main @@ -242,15 +242,18 @@ nextk: } func printComment(w io.Writer, classes []class) { - fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-11s\n", "class", "bytes/obj", "bytes/span", "objects", "waste bytes") + fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste") + prevSize := 0 for i, c := range classes { if i == 0 { continue } spanSize := c.npages * pageSize objects := spanSize / c.size - waste := spanSize - c.size*(spanSize/c.size) - fmt.Fprintf(w, "// %5d %9d %10d %7d %11d\n", i, c.size, spanSize, objects, waste) + tailWaste := spanSize - c.size*(spanSize/c.size) + maxWaste := float64((c.size-prevSize-1)*objects+tailWaste) / float64(spanSize) + prevSize = c.size + fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%%\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste) } fmt.Fprintf(w, "\n") } diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index 71dc2239854..22f5195cd58 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -589,12 +589,13 @@ func updatememstats() { memstats.heap_objects = memstats.nmalloc - memstats.nfree } +// cachestats flushes all mcache stats. +// +// The world must be stopped. +// //go:nowritebarrier func cachestats() { - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp { c := p.mcache if c == nil { continue @@ -610,9 +611,6 @@ func cachestats() { //go:nowritebarrier func flushmcache(i int) { p := allp[i] - if p == nil { - return - } c := p.mcache if c == nil { return @@ -665,7 +663,7 @@ func purgecachedstats(c *mcache) { // overflow errors. //go:nosplit func mSysStatInc(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, int64(n)) return } @@ -679,7 +677,7 @@ func mSysStatInc(sysStat *uint64, n uintptr) { // mSysStatInc apply. 
//go:nosplit func mSysStatDec(sysStat *uint64, n uintptr) { - if sys.BigEndian != 0 { + if sys.BigEndian { atomic.Xadd64(sysStat, -int64(n)) return } diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go new file mode 100644 index 00000000000..a060df8bc06 --- /dev/null +++ b/libgo/go/runtime/mwbbuf.go @@ -0,0 +1,248 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This implements the write barrier buffer. The write barrier itself +// is gcWriteBarrier and is implemented in assembly. +// +// The write barrier has a fast path and a slow path. The fast path +// simply enqueues to a per-P write barrier buffer. It's written in +// assembly and doesn't clobber any general purpose registers, so it +// doesn't have the usual overheads of a Go call. +// +// When the buffer fills up, the write barrier invokes the slow path +// (wbBufFlush) to flush the buffer to the GC work queues. In this +// path, since the compiler didn't spill registers, we spill *all* +// registers and disallow any GC safe points that could observe the +// stack frame (since we don't know the types of the spilled +// registers). + +package runtime + +import ( + "runtime/internal/sys" + "unsafe" +) + +// testSmallBuf forces a small write barrier buffer to stress write +// barrier flushing. +const testSmallBuf = false + +// wbBuf is a per-P buffer of pointers queued by the write barrier. +// This buffer is flushed to the GC workbufs when it fills up and on +// various GC transitions. +// +// This is closely related to a "sequential store buffer" (SSB), +// except that SSBs are usually used for maintaining remembered sets, +// while this is used for marking. +type wbBuf struct { + // next points to the next slot in buf. It must not be a + // pointer type because it can point past the end of buf and + // must be updated without write barriers. + // + // This is a pointer rather than an index to optimize the + // write barrier assembly. + next uintptr + + // end points to just past the end of buf. It must not be a + // pointer type because it points past the end of buf and must + // be updated without write barriers. + end uintptr + + // buf stores a series of pointers to execute write barriers + // on. This must be a multiple of wbBufEntryPointers because + // the write barrier only checks for overflow once per entry. + buf [wbBufEntryPointers * wbBufEntries]uintptr +} + +const ( + // wbBufEntries is the number of write barriers between + // flushes of the write barrier buffer. + // + // This trades latency for throughput amortization. Higher + // values amortize flushing overhead more, but increase the + // latency of flushing. Higher values also increase the cache + // footprint of the buffer. + // + // TODO: What is the latency cost of this? Tune this value. + wbBufEntries = 256 + + // wbBufEntryPointers is the number of pointers added to the + // buffer by each write barrier. + wbBufEntryPointers = 2 +) + +// reset empties b by resetting its next and end pointers. +func (b *wbBuf) reset() { + start := uintptr(unsafe.Pointer(&b.buf[0])) + b.next = start + if gcBlackenPromptly || writeBarrier.cgo { + // Effectively disable the buffer by forcing a flush + // on every barrier. + b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) + } else if testSmallBuf { + // For testing, allow two barriers in the buffer. If + // we only did one, then barriers of non-heap pointers + // would be no-ops. 
This lets us combine a buffered + barrier with a flush at a later time. + b.end = uintptr(unsafe.Pointer(&b.buf[2*wbBufEntryPointers])) + } else { + b.end = start + uintptr(len(b.buf))*unsafe.Sizeof(b.buf[0]) + } + + if (b.end-b.next)%(wbBufEntryPointers*unsafe.Sizeof(b.buf[0])) != 0 { + throw("bad write barrier buffer bounds") + } +} + +// putFast adds old and new to the write barrier buffer and returns +// false if a flush is necessary. Callers should use this as: +// +// buf := &getg().m.p.ptr().wbBuf +// if !buf.putFast(old, new) { +// wbBufFlush(...) +// } +// +// The arguments to wbBufFlush depend on whether the caller is doing +// its own cgo pointer checks. If it is, then this can be +// wbBufFlush(nil, 0). Otherwise, it must pass the slot address and +// new. +// +// Since buf is a per-P resource, the caller must ensure there are no +// preemption points while buf is in use. +// +// It must be nowritebarrierrec because write barriers here would +// corrupt the write barrier buffer. It (and everything it calls, if +// it called anything) has to be nosplit to avoid scheduling onto a +// different P and a different buffer. +// +//go:nowritebarrierrec +//go:nosplit +func (b *wbBuf) putFast(old, new uintptr) bool { + p := (*[2]uintptr)(unsafe.Pointer(b.next)) + p[0] = old + p[1] = new + b.next += 2 * sys.PtrSize + return b.next != b.end +} + +// wbBufFlush flushes the current P's write barrier buffer to the GC +// workbufs. It is passed the slot and value of the write barrier that +// caused the flush so that it can implement cgocheck. +// +// This must not have write barriers because it is part of the write +// barrier implementation. +// +// This and everything it calls must be nosplit because 1) the stack +// contains untyped slots from gcWriteBarrier and 2) there must not be +// a GC safe point between the write barrier test in the caller and +// flushing the buffer. +// +// TODO: A "go:nosplitrec" annotation would be perfect for this. +// +//go:nowritebarrierrec +//go:nosplit +func wbBufFlush(dst *uintptr, src uintptr) { + if getg().m.dying > 0 { + // We're going down. Not much point in write barriers + // and this way we can allow write barriers in the + // panic path. + return + } + + if writeBarrier.cgo && dst != nil { + // This must be called from the stack that did the + // write. It's nosplit all the way down. + cgoCheckWriteBarrier(dst, src) + if !writeBarrier.needed { + // We were only called for cgocheck. + b := &getg().m.p.ptr().wbBuf + b.next = uintptr(unsafe.Pointer(&b.buf[0])) + return + } + } + + // Switch to the system stack so we don't have to worry about + // the untyped stack slots or safe points. + systemstack(func() { + wbBufFlush1(getg().m.p.ptr()) + }) +} + +// wbBufFlush1 flushes p's write barrier buffer to the GC work queue. +// +// This must not have write barriers because it is part of the write +// barrier implementation; a write barrier here could lead to infinite +// loops or buffer corruption. +// +// This must be non-preemptible because it uses the P's workbuf. +// +//go:nowritebarrierrec +//go:systemstack +func wbBufFlush1(_p_ *p) { + // Get the buffered pointers. + start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0])) + n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0]) + ptrs := _p_.wbBuf.buf[:n] + + // Reset the buffer. + _p_.wbBuf.reset() + + if useCheckmark { + // Slow path for checkmark mode. + for _, ptr := range ptrs { + shade(ptr) + } + return + } + + // Mark all of the pointers in the buffer and record only the + // pointers we greyed.
We use the buffer itself to temporarily + // record greyed pointers. + // + // TODO: Should scanobject/scanblock just stuff pointers into + // the wbBuf? Then this would become the sole greying path. + gcw := &_p_.gcw + pos := 0 + arenaStart := mheap_.arena_start + for _, ptr := range ptrs { + if ptr < arenaStart { + // nil pointers are very common, especially + // for the "old" values. Filter out these and + // other "obvious" non-heap pointers ASAP. + // + // TODO: Should we filter out nils in the fast + // path to reduce the rate of flushes? + continue + } + // TODO: This doesn't use hbits, so calling + // heapBitsForObject seems a little silly. We could + // easily separate this out since heapBitsForObject + // just calls heapBitsForAddr(obj) to get hbits. + obj, _, span, objIndex := heapBitsForObject(ptr, 0, 0, false) + if obj == 0 { + continue + } + // TODO: Consider making two passes where the first + // just prefetches the mark bits. + mbits := span.markBitsForIndex(objIndex) + if mbits.isMarked() { + continue + } + mbits.setMarked() + if span.spanclass.noscan() { + gcw.bytesMarked += uint64(span.elemsize) + continue + } + ptrs[pos] = obj + pos++ + } + + // Enqueue the greyed objects. + gcw.putBatch(ptrs[:pos]) + if gcphase == _GCmarktermination || gcBlackenPromptly { + // Ps aren't allowed to cache work during mark + // termination. + gcw.dispose() + } +} diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go index 47927fe7c37..1f68effbf9d 100644 --- a/libgo/go/runtime/netpoll_kqueue.go +++ b/libgo/go/runtime/netpoll_kqueue.go @@ -97,10 +97,23 @@ retry: for i := 0; i < int(n); i++ { ev := &events[i] var mode int32 - if ev.filter == _EVFILT_READ { + switch ev.filter { + case _EVFILT_READ: mode += 'r' - } - if ev.filter == _EVFILT_WRITE { + + // On some systems when the read end of a pipe + // is closed the write end will not get a + // _EVFILT_WRITE event, but will get a + // _EVFILT_READ event with EV_EOF set. + // Note that setting 'w' here just means that we + // will wake up a goroutine waiting to write; + // that goroutine will try the write again, + // and the appropriate thing will happen based + // on what that write returns (success, EPIPE, EAGAIN). + if ev.flags&_EV_EOF != 0 { + mode += 'w' + } + case _EVFILT_WRITE: mode += 'w' } if mode != 0 { diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go index 79dafb02796..134071f5e3c 100644 --- a/libgo/go/runtime/netpoll_windows.go +++ b/libgo/go/runtime/netpoll_windows.go @@ -47,7 +47,7 @@ func netpolldescriptor() uintptr { func netpollopen(fd uintptr, pd *pollDesc) int32 { if stdcall4(_CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0 { - return -int32(getlasterror()) + return int32(getlasterror()) } return 0 } diff --git a/libgo/go/runtime/os_freebsd.go b/libgo/go/runtime/os_freebsd.go index a4d2886d6af..8c3535b893b 100644 --- a/libgo/go/runtime/os_freebsd.go +++ b/libgo/go/runtime/os_freebsd.go @@ -16,6 +16,17 @@ type mOS struct { //extern _umtx_op func sys_umtx_op(addr *uint32, mode int32, val uint32, uaddr1 uintptr, ts *umtx_time) int32 +func getPageSize() uintptr { + mib := [2]uint32{_CTL_HW, _HW_PAGESIZE} + out := uint32(0) + nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 { + return uintptr(out) + } + return 0 +} + // FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and // thus the code is largely similar. See Linux implementation // and lock_futex.go for comments.
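The getPageSize helper above issues the raw CTL_HW/HW_PAGESIZE sysctl because the runtime cannot lean on libc conveniences. Ordinary programs can make the same query by name; a sketch for FreeBSD (or another BSD), assuming the golang.org/x/sys/unix module is available, with os.Getpagesize as the portable route:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	// Same CTL_HW/HW_PAGESIZE query the runtime performs, by name.
	if sz, err := unix.SysctlUint32("hw.pagesize"); err == nil {
		fmt.Println("hw.pagesize:", sz)
	}
	// Portable alternative for code that just needs the page size.
	fmt.Println("os.Getpagesize:", os.Getpagesize())
}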
diff --git a/libgo/go/runtime/os_linux.go b/libgo/go/runtime/os_linux.go index e1a6a308cf2..816327e70b8 100644 --- a/libgo/go/runtime/os_linux.go +++ b/libgo/go/runtime/os_linux.go @@ -106,45 +106,46 @@ func sysargs(argc int32, argv **byte) { // now argv+n is auxv auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) - if sysauxv(auxv[:]) == 0 { - // In some situations we don't get a loader-provided - // auxv, such as when loaded as a library on Android. - // Fall back to /proc/self/auxv. - fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) - if fd < 0 { - // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to - // try using mincore to detect the physical page size. - // mincore should return EINVAL when address is not a multiple of system page size. - const size = 256 << 10 // size of memory region to allocate - p := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) - if uintptr(p) < 4096 { - return - } - var n uintptr - for n = 4 << 10; n < size; n <<= 1 { - err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) - if err == 0 { - physPageSize = n - break - } - } - if physPageSize == 0 { - physPageSize = size - } - munmap(p, size) + if sysauxv(auxv[:]) != 0 { + return + } + // In some situations we don't get a loader-provided + // auxv, such as when loaded as a library on Android. + // Fall back to /proc/self/auxv. + fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0) + if fd < 0 { + // On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to + // try using mincore to detect the physical page size. + // mincore should return EINVAL when address is not a multiple of system page size. + const size = 256 << 10 // size of memory region to allocate + p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { return } - var buf [128]uintptr - n := read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) - closefd(fd) - if n < 0 { - return + var n uintptr + for n = 4 << 10; n < size; n <<= 1 { + err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0]) + if err == 0 { + physPageSize = n + break + } + } + if physPageSize == 0 { + physPageSize = size } - // Make sure buf is terminated, even if we didn't read - // the whole file. - buf[len(buf)-2] = _AT_NULL - sysauxv(buf[:]) + munmap(p, size) + return + } + var buf [128]uintptr + n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf))) + closefd(fd) + if n < 0 { + return } + // Make sure buf is terminated, even if we didn't read + // the whole file. + buf[len(buf)-2] = _AT_NULL + sysauxv(buf[:]) } func sysauxv(auxv []uintptr) int { diff --git a/libgo/go/runtime/os_linux_ppc64x.go b/libgo/go/runtime/os_linux_ppc64x.go index b324344493e..d27902d794d 100644 --- a/libgo/go/runtime/os_linux_ppc64x.go +++ b/libgo/go/runtime/os_linux_ppc64x.go @@ -7,55 +7,22 @@ package runtime -import ( - "runtime/internal/sys" -) +// For go:linkname +import _ "unsafe" -const ( - // ISA level - // Go currently requires POWER5 as a minimum for ppc64, so we need - // to check for ISA 2.03 and beyond. 
- _PPC_FEATURE_POWER5_PLUS = 0x00020000 // ISA 2.03 (POWER5+) - _PPC_FEATURE_ARCH_2_05 = 0x00001000 // ISA 2.05 (POWER6) - _PPC_FEATURE_POWER6_EXT = 0x00000200 // mffgpr/mftgpr extension (POWER6x) - _PPC_FEATURE_ARCH_2_06 = 0x00000100 // ISA 2.06 (POWER7) - _PPC_FEATURE2_ARCH_2_07 = 0x80000000 // ISA 2.07 (POWER8) +// ppc64x doesn't have a 'cpuid' instruction equivalent and relies on +// HWCAP/HWCAP2 bits for hardware capabilities. - // Standalone capabilities - _PPC_FEATURE_HAS_ALTIVEC = 0x10000000 // SIMD/Vector unit - _PPC_FEATURE_HAS_VSX = 0x00000080 // Vector scalar unit -) - -type facilities struct { - _ [sys.CacheLineSize]byte - isPOWER5x bool // ISA 2.03 - isPOWER6 bool // ISA 2.05 - isPOWER6x bool // ISA 2.05 + mffgpr/mftgpr extension - isPOWER7 bool // ISA 2.06 - isPOWER8 bool // ISA 2.07 - hasVMX bool // Vector unit - hasVSX bool // Vector scalar unit - _ [sys.CacheLineSize]byte -} - -// cpu can be tested at runtime in go assembler code to check for -// a certain ISA level or hardware capability, for example: -// ·cpu+facilities_hasVSX(SB) for checking the availability of VSX -// or -// ·cpu+facilities_isPOWER7(SB) for checking if the processor implements -// ISA 2.06 instructions. -var cpu facilities +//go:linkname cpu_hwcap internal/cpu.ppc64x_hwcap +//go:linkname cpu_hwcap2 internal/cpu.ppc64x_hwcap2 +var cpu_hwcap uint +var cpu_hwcap2 uint func archauxv(tag, val uintptr) { switch tag { case _AT_HWCAP: - cpu.isPOWER5x = val&_PPC_FEATURE_POWER5_PLUS != 0 - cpu.isPOWER6 = val&_PPC_FEATURE_ARCH_2_05 != 0 - cpu.isPOWER6x = val&_PPC_FEATURE_POWER6_EXT != 0 - cpu.isPOWER7 = val&_PPC_FEATURE_ARCH_2_06 != 0 - cpu.hasVMX = val&_PPC_FEATURE_HAS_ALTIVEC != 0 - cpu.hasVSX = val&_PPC_FEATURE_HAS_VSX != 0 + cpu_hwcap = uint(val) case _AT_HWCAP2: - cpu.isPOWER8 = val&_PPC_FEATURE2_ARCH_2_07 != 0 + cpu_hwcap2 = uint(val) } } diff --git a/libgo/go/runtime/os_netbsd.go b/libgo/go/runtime/os_netbsd.go index 464ce88d9c4..81ebe7636a1 100644 --- a/libgo/go/runtime/os_netbsd.go +++ b/libgo/go/runtime/os_netbsd.go @@ -15,7 +15,7 @@ type mOS struct { //go:noescape //extern lwp_park -func lwp_park(abstime *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 +func lwp_park(ts int32, rel int32, abstime *timespec, unpark int32, hint, unparkhint unsafe.Pointer) int32 //go:noescape //extern lwp_unpark @@ -31,10 +31,9 @@ func semasleep(ns int64) int32 { // Compute sleep deadline. var tsp *timespec + var ts timespec if ns >= 0 { - var ts timespec var nsec int32 - ns += nanotime() ts.set_sec(int64(timediv(ns, 1000000000, &nsec))) ts.set_nsec(nsec) tsp = &ts @@ -50,9 +49,18 @@ func semasleep(ns int64) int32 { } // Sleep until unparked by semawakeup or timeout. - ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.mos.waitsemacount), nil) + ret := lwp_park(_CLOCK_MONOTONIC, _TIMER_RELTIME, tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil) if ret == _ETIMEDOUT { return -1 + } else if ret == _EINTR && ns >= 0 { + // Avoid sleeping forever if we keep getting + // interrupted (for example by the profiling + // timer). It would be better if tsp upon return had the + // remaining time to sleep, but this is good enough.
+ var nsec int32 + ns /= 2 + ts.set_sec(timediv(ns, 1000000000, &nsec)) + ts.set_nsec(nsec) } } } diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index c39a58d0c4b..5cc325f3954 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -170,7 +170,18 @@ func freedefer(d *_defer) { unlock(&sched.deferlock) }) } - *d = _defer{} + + // These lines used to be simply `*d = _defer{}` but that + // started causing a nosplit stack overflow via typedmemmove. + d.link = nil + d.frame = nil + d.panicStack = nil + d._panic = nil + d.pfn = 0 + d.arg = nil + d.retaddr = 0 + d.makefunccanrecover = false + pp.deferpool = append(pp.deferpool, d) } @@ -327,7 +338,7 @@ func unwindStack() { // Goexit terminates the goroutine that calls it. No other goroutine is affected. // Goexit runs all deferred calls before terminating the goroutine. Because Goexit -// is not panic, however, any recover calls in those deferred functions will return nil. +// is not a panic, any recover calls in those deferred functions will return nil. // // Calling Goexit from the main goroutine terminates that goroutine // without func main returning. Since func main has not returned, @@ -599,7 +610,7 @@ func canrecover(retaddr uintptr) bool { // caller starts with "runtime.", then we are permitted to // call recover. var locs [16]location - if callers(2, locs[:2]) < 2 { + if callers(1, locs[:2]) < 2 { return false } @@ -619,7 +630,7 @@ func canrecover(retaddr uintptr) bool { // reflect.makeFuncStub or reflect.ffi_callback called by FFI // functions. Then we check the caller of that function. - n := callers(3, locs[:]) + n := callers(2, locs[:]) foundFFICallback := false i := 0 for ; i < n; i++ { @@ -822,6 +833,12 @@ var panicking uint32 // so that two concurrent panics don't overlap their output. var paniclk mutex +// startpanic_m implements unrecoverable panic. +// +// It can have write barriers because the write barrier explicitly +// ignores writes once dying > 0. +// +//go:yeswritebarrierrec func startpanic() { _g_ := getg() // Uncomment when mheap_ is in Go. @@ -860,7 +877,7 @@ func startpanic() { exit(4) fallthrough default: - // Can't even print! Just exit. + // Can't even print! Just exit. exit(5) } } diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index a57b69dca35..8a562e2ce8b 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -18,7 +18,7 @@ // To add equivalent profiling support to a standalone program, add // code like the following to your main function: // -// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile `file`") +// var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to `file`") // var memprofile = flag.String("memprofile", "", "write memory profile to `file`") // // func main() { @@ -319,7 +319,15 @@ func (p *Profile) WriteTo(w io.Writer, debug int) error { p.mu.Unlock() // Map order is non-deterministic; make output deterministic. 
- sort.Sort(stackProfile(all)) + sort.Slice(all, func(i, j int) bool { + t, u := all[i], all[j] + for k := 0; k < len(t) && k < len(u); k++ { + if t[k] != u[k] { + return t[k] < u[k] + } + } + return len(t) < len(u) + }) return printCountProfile(w, debug, p.name, stackProfile(all)) } @@ -328,16 +336,6 @@ type stackProfile [][]uintptr func (x stackProfile) Len() int { return len(x) } func (x stackProfile) Stack(i int) []uintptr { return x[i] } -func (x stackProfile) Swap(i, j int) { x[i], x[j] = x[j], x[i] } -func (x stackProfile) Less(i, j int) bool { - t, u := x[i], x[j] - for k := 0; k < len(t) && k < len(u); k++ { - if t[k] != u[k] { - return t[k] < u[k] - } - } - return len(t) < len(u) -} // A countProfile is a set of stack traces to be printed as counts // grouped by stack trace. There are multiple implementations: @@ -348,6 +346,41 @@ type countProfile interface { Stack(i int) []uintptr } +// printCountCycleProfile outputs block profile records (for block or mutex profiles) +// as the pprof-proto format output. Translations from cycle count to time duration +// are done because the proto format expects count and time (nanoseconds) instead of count +// and the number of cycles for block and contention profiles. +func printCountCycleProfile(w io.Writer, countName, cycleName string, records []runtime.BlockProfileRecord) error { + // Output profile in protobuf form. + b := newProfileBuilder(w) + b.pbValueType(tagProfile_PeriodType, countName, "count") + b.pb.int64Opt(tagProfile_Period, 1) + b.pbValueType(tagProfile_SampleType, countName, "count") + b.pbValueType(tagProfile_SampleType, cycleName, "nanoseconds") + + cpuGHz := float64(runtime_cyclesPerSecond()) / 1e9 + + values := []int64{0, 0} + var locs []uint64 + for _, r := range records { + values[0] = int64(r.Count) + values[1] = int64(float64(r.Cycles) / cpuGHz) // to nanoseconds + locs = locs[:0] + for _, addr := range r.Stack() { + // For count profiles, all stack addresses are + // return PCs, which is what locForPC expects. + l := b.locForPC(addr) + if l == 0 { // runtime.goexit + continue + } + locs = append(locs, l) + } + b.pbSample(values, locs, nil) + } + b.build() + return nil +} + // printCountProfile prints a countProfile at the specified debug level. // The profile will be in compressed proto format unless debug is nonzero. func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { @@ -441,7 +474,7 @@ func printStackRecord(w io.Writer, stk []uintptr, allFrames bool) { // Hide runtime.goexit and any runtime functions at the beginning. // This is useful mainly for allocation traces. - skip := name == "runtime.goexit" + skip := name == "runtime.goexit" || name == "runtime.kickoff" if !show { switch { case strings.HasPrefix(name, "runtime."): @@ -490,6 +523,14 @@ func countHeap() int { // writeHeap writes the current runtime heap profile to w. func writeHeap(w io.Writer, debug int) error { + var memStats *runtime.MemStats + if debug != 0 { + // Read mem stats first, so that our other allocations + // do not appear in the statistics. + memStats = new(runtime.MemStats) + runtime.ReadMemStats(memStats) + } + // Find out how many records there are (MemProfile(nil, true)), // allocate that many records, and get the data. // There's a race—more records might be added between // this call and the subsequent one— @@ -552,8 +593,7 @@ func writeHeap(w io.Writer, debug int) error { // Print memstats information too.
// Pprof will ignore, but useful for people - s := new(runtime.MemStats) - runtime.ReadMemStats(s) + s := memStats fmt.Fprintf(w, "\n# runtime.MemStats\n") fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc) fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc) @@ -779,14 +819,14 @@ func writeBlock(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- contention:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) for i := range p { @@ -823,14 +863,14 @@ func writeMutex(w io.Writer, debug int) error { sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles }) - b := bufio.NewWriter(w) - var tw *tabwriter.Writer - w = b - if debug > 0 { - tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) - w = tw + if debug <= 0 { + return printCountCycleProfile(w, "contentions", "delay", p) } + b := bufio.NewWriter(w) + tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + fmt.Fprintf(w, "--- mutex:\n") fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1)) diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index 9e5e403b741..08a4f969ca2 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -26,16 +26,18 @@ import ( "time" ) -func cpuHogger(f func() int, dur time.Duration) { +func cpuHogger(f func(x int) int, y *int, dur time.Duration) { // We only need to get one 100 Hz clock tick, so we've got // a large safety buffer. // But do at least 500 iterations (which should take about 100ms), // otherwise TestCPUProfileMultithreaded can fail if only one // thread is scheduled during the testing period. t0 := time.Now() + accum := *y for i := 0; i < 500 || time.Since(t0) < dur; i++ { - f() + accum = f(accum) } + *y = accum } var ( @@ -46,8 +48,8 @@ var ( // The actual CPU hogging function. // Must not call other functions nor access heap/globals in the loop, // otherwise under race detector the samples will be in the race runtime. 
-func cpuHog1() int { - foo := salt1 +func cpuHog1(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -58,8 +60,8 @@ func cpuHog1() int { return foo } -func cpuHog2() int { - foo := salt2 +func cpuHog2(x int) int { + foo := x for i := 0; i < 1e5; i++ { if foo > 0 { foo *= foo @@ -72,7 +74,7 @@ func cpuHog2() int { func TestCPUProfile(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHog1"}, func(dur time.Duration) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) } @@ -81,29 +83,29 @@ func TestCPUProfileMultithreaded(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHog1", "pprof.cpuHog2"}, func(dur time.Duration) { c := make(chan int) go func() { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) c <- 1 }() - cpuHogger(cpuHog2, dur) + cpuHogger(cpuHog2, &salt2, dur) <-c }) } func TestCPUProfileInlining(t *testing.T) { testCPUProfile(t, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, func(dur time.Duration) { - cpuHogger(inlinedCaller, dur) + cpuHogger(inlinedCaller, &salt1, dur) }) } -func inlinedCaller() int { - inlinedCallee() - return 0 +func inlinedCaller(x int) int { + x = inlinedCallee(x) + return x } -func inlinedCallee() { +func inlinedCallee(x int) int { // We could just use cpuHog1, but for loops prevent inlining // right now. :( - foo := salt1 + foo := x i := 0 loop: if foo > 0 { @@ -114,7 +116,7 @@ loop: if i++; i < 1e5 { goto loop } - salt1 = foo + return foo } func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) { @@ -177,9 +179,9 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { } } - if badOS[runtime.GOOS] { + switch runtime.GOOS { + case "darwin", "dragonfly", "netbsd", "solaris": t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS) - return } // Ignore the failure if the tests are running in a QEMU-based emulator, // QEMU is not perfect at emulating everything. @@ -187,7 +189,6 @@ func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) { // IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605. if os.Getenv("IN_QEMU") == "1" { t.Skip("ignore the failure in QEMU; see golang.org/issue/9605") - return } t.FailNow() } @@ -394,60 +395,108 @@ func TestMathBigDivide(t *testing.T) { }) } -// Operating systems that are expected to fail the tests. See issue 13841. 
-var badOS = map[string]bool{ - "darwin": true, - "netbsd": true, - "plan9": true, - "dragonfly": true, - "solaris": true, -} - func TestBlockProfile(t *testing.T) { t.Skip("lots of details are different for gccgo; FIXME") type TestCase struct { name string f func() + stk []string re string } tests := [...]TestCase{ - {"chan recv", blockChanRecv, ` + { + name: "chan recv", + f: blockChanRecv, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanRecv", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanRecv\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan send", blockChanSend, ` + { + name: "chan send", + f: blockChanSend, + stk: []string{ + "runtime.chansend1", + "runtime/pprof.blockChanSend", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chansend1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanSend\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"chan close", blockChanClose, ` + { + name: "chan close", + f: blockChanClose, + stk: []string{ + "runtime.chanrecv1", + "runtime/pprof.blockChanClose", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockChanClose\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select recv async", blockSelectRecvAsync, ` + { + name: "select recv async", + f: blockSelectRecvAsync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectRecvAsync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectRecvAsync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"select send sync", blockSelectSendSync, ` + { + name: "select send sync", + f: blockSelectSendSync, + stk: []string{ + "runtime.selectgo", + "runtime/pprof.blockSelectSendSync", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockSelectSendSync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"mutex", blockMutex, ` + { + name: "mutex", + f: blockMutex, + stk: []string{ + "sync.(*Mutex).Lock", + "runtime/pprof.blockMutex", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9a-f]+ .*/src/sync/mutex\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockMutex\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ `}, - {"cond", blockCond, ` + { + name: "cond", + f: blockCond, + stk: []string{ + 
"sync.(*Cond).Wait", + "runtime/pprof.blockCond", + "runtime/pprof.TestBlockProfile", + }, + re: ` [0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+ # 0x[0-9a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9a-f]+ .*/src/sync/cond\.go:[0-9]+ # 0x[0-9a-f]+ runtime/pprof\.blockCond\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ @@ -455,28 +504,84 @@ func TestBlockProfile(t *testing.T) { `}, } + // Generate block profile runtime.SetBlockProfileRate(1) defer runtime.SetBlockProfileRate(0) for _, test := range tests { test.f() } - var w bytes.Buffer - Lookup("block").WriteTo(&w, 1) - prof := w.String() - if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { - t.Fatalf("Bad profile header:\n%v", prof) - } + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("block").WriteTo(&w, 1) + prof := w.String() - if strings.HasSuffix(prof, "#\t0x0\n\n") { - t.Errorf("Useless 0 suffix:\n%v", prof) + if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { + t.Fatalf("Bad profile header:\n%v", prof) + } + + if strings.HasSuffix(prof, "#\t0x0\n\n") { + t.Errorf("Useless 0 suffix:\n%v", prof) + } + + for _, test := range tests { + if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { + t.Errorf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) + } + } + }) + + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("block").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, test := range tests { + if !containsStack(stks, test.stk) { + t.Errorf("No matching stack entry for %v, want %+v", test.name, test.stk) + } + } + }) + +} + +func stacks(p *profile.Profile) (res [][]string) { + for _, s := range p.Sample { + var stk []string + for _, l := range s.Location { + for _, line := range l.Line { + stk = append(stk, line.Function.Name) + } + } + res = append(res, stk) } + return res +} - for _, test := range tests { - if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { - t.Fatalf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) +func containsStack(got [][]string, want []string) bool { + for _, stk := range got { + if len(stk) < len(want) { + continue + } + for i, f := range want { + if f != stk[i] { + break + } + if i == len(want)-1 { + return true + } } } + return false } const blockDelay = 10 * time.Millisecond @@ -568,6 +673,8 @@ func blockCond() { } func TestMutexProfile(t *testing.T) { + // Generate mutex profile + old := runtime.SetMutexProfileFraction(1) defer runtime.SetMutexProfileFraction(old) if old != 0 { @@ -576,39 +683,60 @@ func TestMutexProfile(t *testing.T) { blockMutex() - var w bytes.Buffer - Lookup("mutex").WriteTo(&w, 1) - prof := w.String() + t.Run("debug=1", func(t *testing.T) { + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 1) + prof := w.String() + t.Logf("received profile: %v", prof) - if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { - t.Errorf("Bad profile header:\n%v", prof) - } - prof = strings.Trim(prof, "\n") - lines := strings.Split(prof, "\n") - // gccgo adds an extra line in the stack trace, not sure why. 
- if len(lines) < 6 { - t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) - } - if len(lines) < 6 { - return - } - // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" - r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` - //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" - if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok { - t.Errorf("%q didn't match %q", lines[3], r2) - } - r3 := "^#.*pprof.\\$nested.*$" - match := false - for _, i := range []int{5, 6} { - if ok, _ := regexp.MatchString(r3, lines[i]); ok { - match = true - break + if !strings.HasPrefix(prof, "--- mutex:\ncycles/second=") { + t.Errorf("Bad profile header:\n%v", prof) } - } - if !match { - t.Errorf("neither %q nor %q matched %q", lines[5], lines[6], r3) - } + prof = strings.Trim(prof, "\n") + lines := strings.Split(prof, "\n") + if len(lines) != 6 { + t.Errorf("expected 6 lines, got %d %q\n%s", len(lines), prof, prof) + } + if len(lines) < 6 { + return + } + // checking that the line is like "35258904 1 @ 0x48288d 0x47cd28 0x458931" + r2 := `^\d+ 1 @(?: 0x[[:xdigit:]]+)+` + //r2 := "^[0-9]+ 1 @ 0x[0-9a-f x]+$" + if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[3], r2) + } + if runtime.Compiler != "gccgo" { + r3 := "^#.*pprof.blockMutex.*$" + if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok { + t.Errorf("%q didn't match %q", lines[5], r3) + } + } + t.Logf(prof) + }) + t.Run("proto", func(t *testing.T) { + // proto format + var w bytes.Buffer + Lookup("mutex").WriteTo(&w, 0) + p, err := profile.Parse(&w) + if err != nil { + t.Fatalf("failed to parse profile: %v", err) + } + t.Logf("parsed proto: %s", p) + if err := p.CheckValid(); err != nil { + t.Fatalf("invalid profile: %v", err) + } + + stks := stacks(p) + for _, want := range [][]string{ + // {"sync.(*Mutex).Unlock", "pprof.blockMutex.func1"}, + {"sync.Unlock.pN10_sync.Mutex", "pprof.$nested17"}, + } { + if !containsStack(stks, want) { + t.Errorf("No matching stack entry for %+v", want) + } + } + }) } func func1(c chan int) { <-c } @@ -725,7 +853,7 @@ func TestEmptyCallStack(t *testing.T) { func TestCPUProfileLabel(t *testing.T) { testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, func(dur time.Duration) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, dur) + cpuHogger(cpuHog1, &salt1, dur) }) }) } @@ -738,14 +866,15 @@ func TestLabelRace(t *testing.T) { start := time.Now() var wg sync.WaitGroup for time.Since(start) < dur { + var salts [10]int for i := 0; i < 10; i++ { wg.Add(1) - go func() { + go func(j int) { Do(context.Background(), Labels("key", "value"), func(context.Context) { - cpuHogger(cpuHog1, time.Millisecond) + cpuHogger(cpuHog1, &salts[j], time.Millisecond) }) wg.Done() - }() + }(i) } wg.Wait() } diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go index 5e1d71c7e72..793be44a417 100644 --- a/libgo/go/runtime/pprof/proto.go +++ b/libgo/go/runtime/pprof/proto.go @@ -202,7 +202,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { // the stack and we have return PCs anyway. frames := runtime.CallersFrames([]uintptr{addr}) frame, more := frames.Next() - if frame.Function == "runtime.goexit" { + if frame.Function == "runtime.goexit" || frame.Function == "runtime.kickoff" { // Short-circuit if we see runtime.goexit so the loop // below doesn't allocate a useless empty location. 
return 0 @@ -228,7 +228,7 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 { start := b.pb.startMessage() b.pb.uint64Opt(tagLocation_ID, id) b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC)) - for frame.Function != "runtime.goexit" { + for frame.Function != "runtime.goexit" && frame.Function != "runtime.kickoff" { // Write out each line in frame expansion. funcID := uint64(b.funcs[frame.Function]) if funcID == 0 { diff --git a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go index 4db726a7552..3da05ad5f9e 100644 --- a/libgo/go/runtime/print.go +++ b/libgo/go/runtime/print.go @@ -78,7 +78,7 @@ var debuglock mutex // The compiler emits calls to printlock and printunlock around // the multiple calls that implement a single Go print or println -// statement. Some of the print helpers (printsp, for example) +// statement. Some of the print helpers (printslice, for example) // call print recursively. There is also the problem of a crash // happening during the print routines and needing to acquire // the print lock to print information about the crash. @@ -120,31 +120,31 @@ func gwrite(b []byte) { } func printsp() { - print(" ") + printstring(" ") } func printnl() { - print("\n") + printstring("\n") } func printbool(v bool) { if v { - print("true") + printstring("true") } else { - print("false") + printstring("false") } } func printfloat(v float64) { switch { case v != v: - print("NaN") + printstring("NaN") return case v+v == v && v > 0: - print("+Inf") + printstring("+Inf") return case v+v == v && v < 0: - print("-Inf") + printstring("-Inf") return } @@ -226,7 +226,7 @@ func printuint(v uint64) { func printint(v int64) { if v < 0 { - print("-") + printstring("-") v = -v } printuint(uint64(v)) diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 345f57b6875..1ea41528600 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -34,6 +34,7 @@ import ( //go:linkname helpgc runtime.helpgc //go:linkname kickoff runtime.kickoff //go:linkname mstart1 runtime.mstart1 +//go:linkname mexit runtime.mexit //go:linkname globrunqput runtime.globrunqput //go:linkname pidleget runtime.pidleget @@ -54,6 +55,7 @@ func getTraceback(me, gp *g) func gtraceback(*g) func _cgo_notify_runtime_init_done() func alreadyInCallers() bool +func stackfree(*g) // Functions created by the compiler. //extern __go_init_main @@ -138,6 +140,9 @@ var ( // it is closed, meaning cgocallbackg can reliably receive from it. var main_init_done chan bool +// mainStarted indicates that the main M has started. +var mainStarted bool + // runtimeInitTime is the nanotime() at which the runtime started. var runtimeInitTime int64 @@ -157,8 +162,8 @@ func main() { maxstacksize = 250000000 } - // Record when the world started. - runtimeInitTime = nanotime() + // Allow newproc to start new Ms. + mainStarted = true systemstack(func() { newm(sysmon, nil) @@ -184,8 +189,15 @@ func main() { } }() + // Record when the world started. Must be after runtime_init + // because nanotime on some platforms depends on startNano. + runtimeInitTime = nanotime() + main_init_done = make(chan bool) if iscgo { + // Start the template thread in case we enter Go from + // a C-created thread and need to create a new thread. + startTemplateThread() _cgo_notify_runtime_init_done() } @@ -269,9 +281,10 @@ func forcegchelper() { } } +//go:nosplit + // Gosched yields the processor, allowing other goroutines to run. It does not // suspend the current goroutine, so execution resumes automatically. 
-//go:nosplit func Gosched() { mcall(gosched_m) } @@ -359,8 +372,8 @@ func releaseSudog(s *sudog) { if s.elem != nil { throw("runtime: sudog with non-nil elem") } - if s.selectdone != nil { - throw("runtime: sudog with non-nil selectdone") + if s.isSelect { + throw("runtime: sudog with non-false isSelect") } if s.next != nil { throw("runtime: sudog with non-nil next") @@ -419,7 +432,7 @@ func funcPC(f interface{}) uintptr { func lockedOSThread() bool { gp := getg() - return gp.lockedm != nil && gp.m.lockedg != nil + return gp.lockedm != 0 && gp.m.lockedg != 0 } var ( @@ -479,13 +492,21 @@ func schedinit() { if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 { procs = n } - if procs > _MaxGomaxprocs { - procs = _MaxGomaxprocs - } if procresize(procs) != nil { throw("unknown runnable goroutine during bootstrap") } + // For cgocheck > 1, we turn on the write barrier at all times + // and check all pointer writes. We can't do this until after + // procresize because the write barrier needs a P. + if debug.cgocheck > 1 { + writeBarrier.cgo = true + writeBarrier.enabled = true + for _, p := range allp { + p.wbBuf.reset() + } + } + if buildVersion == "" { // Condition should never trigger. This code just serves // to ensure runtime·buildVersion is kept in the resulting binary. @@ -501,7 +522,7 @@ func dumpgstatus(gp *g) { func checkmcount() { // sched lock is held - if sched.mcount > sched.maxmcount { + if mcount() > sched.maxmcount { print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") throw("thread exhaustion") } @@ -515,15 +536,20 @@ func mcommoninit(mp *m) { callers(1, mp.createstack[:]) } - mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks()) - if mp.fastrand == 0 { - mp.fastrand = 0x49f6428a - } - lock(&sched.lock) - mp.id = sched.mcount - sched.mcount++ + if sched.mnext+1 < sched.mnext { + throw("runtime: thread ID overflow") + } + mp.id = sched.mnext + sched.mnext++ checkmcount() + + mp.fastrand[0] = 1597334677 * uint32(mp.id) + mp.fastrand[1] = uint32(cputicks()) + if mp.fastrand[0]|mp.fastrand[1] == 0 { + mp.fastrand[1] = 1 + } + mpreinit(mp) // Add to allm so garbage collector doesn't free g->m @@ -735,8 +761,10 @@ func casgstatus(gp *g, oldval, newval uint32) { // _Grunning or _Grunning|_Gscan; either way, // we own gp.gcscanvalid, so it's safe to read. // gp.gcscanvalid must not be true when we are running. - print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") - throw("casgstatus") + systemstack(func() { + print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n") + throw("casgstatus") + }) } // See http://golang.org/cl/21503 for justification of the yield delay. @@ -912,7 +940,7 @@ func stopTheWorld(reason string) { // startTheWorld undoes the effects of stopTheWorld. func startTheWorld() { - systemstack(startTheWorldWithSema) + systemstack(func() { startTheWorldWithSema(false) }) // worldsema must be held over startTheWorldWithSema to ensure // gomaxprocs cannot change while worldsema is held. semrelease(&worldsema) @@ -962,8 +990,7 @@ func stopTheWorldWithSema() { _g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic. 
sched.stopwait-- // try to retake all P's in Psyscall status - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) { if trace.enabled { @@ -1003,8 +1030,7 @@ func stopTheWorldWithSema() { if sched.stopwait != 0 { bad = "stopTheWorld: not stopped (stopwait != 0)" } else { - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.status != _Pgcstop { bad = "stopTheWorld: not stopped (status != _Pgcstop)" } @@ -1028,12 +1054,14 @@ func mhelpgc() { _g_.m.helpgc = -1 } -func startTheWorldWithSema() { +func startTheWorldWithSema(emitTraceEvent bool) int64 { _g_ := getg() - _g_.m.locks++ // disable preemption because it can be holding p in a local var - gp := netpoll(false) // non-blocking - injectglist(gp) + _g_.m.locks++ // disable preemption because it can be holding p in a local var + if netpollinited() { + gp := netpoll(false) // non-blocking + injectglist(gp) + } add := needaddgcproc() lock(&sched.lock) @@ -1068,6 +1096,12 @@ func startTheWorldWithSema() { } } + // Capture start-the-world time before doing clean-up tasks. + startTime := nanotime() + if emitTraceEvent { + traceGCSTWDone() + } + // Wakeup an additional proc in case we have excessive runnable goroutines // in local queues or in the global queue. If we don't, the proc will park itself. // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. @@ -1086,6 +1120,8 @@ func startTheWorldWithSema() { newm(mhelpgc, nil) } _g_.m.locks-- + + return startTime } // First function run by a new goroutine. @@ -1116,15 +1152,13 @@ func kickoff() { throw("no p in kickoff") } } - gp.param = nil fv(param) goexit1() } -// This is called from mstart. -func mstart1() { +func mstart1(dummy int32) { _g_ := getg() if _g_ != _g_.m.g0 { @@ -1137,12 +1171,7 @@ func mstart1() { // prepare the thread to be able to handle the signals. // For gccgo minit was called by C code. if _g_.m == &m0 { - // Create an extra M for callbacks on threads not created by Go. - if iscgo && !cgoHasExtraM { - cgoHasExtraM = true - newextram() - } - initsig(false) + mstartm0() } if fn := _g_.m.mstartfn; fn != nil { @@ -1159,6 +1188,114 @@ func mstart1() { schedule() } +// mstartm0 implements part of mstart1 that only runs on the m0. +// +// Write barriers are allowed here because we know the GC can't be +// running yet, so they'll be no-ops. +// +//go:yeswritebarrierrec +func mstartm0() { + // Create an extra M for callbacks on threads not created by Go. + if iscgo && !cgoHasExtraM { + cgoHasExtraM = true + newextram() + } + initsig(false) +} + +// mexit tears down and exits the current thread. +// +// Don't call this directly to exit the thread, since it must run at +// the top of the thread stack. Instead, use gogo(&_g_.m.g0.sched) to +// unwind the stack to the point that exits the thread. +// +// It is entered with m.p != nil, so write barriers are allowed. It +// will release the P before exiting. +// +//go:yeswritebarrierrec +func mexit(osStack bool) { + g := getg() + m := g.m + + if m == &m0 { + // This is the main thread. Just wedge it. + // + // On Linux, exiting the main thread puts the process + // into a non-waitable zombie state. On Plan 9, + // exiting the main thread unblocks wait even though + // other threads are still running. On Solaris we can + // neither exitThread nor return from mstart. Other + // bad things probably happen on other platforms. 
+ // + // We could try to clean up this M more before wedging + // it, but that complicates signal handling. + handoffp(releasep()) + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + notesleep(&m.park) + throw("locked m0 woke up") + } + + sigblock() + unminit() + + // Free the gsignal stack. + if m.gsignal != nil { + stackfree(m.gsignal) + } + + // Remove m from allm. + lock(&sched.lock) + for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { + if *pprev == m { + *pprev = m.alllink + goto found + } + } + throw("m not found in allm") +found: + if !osStack { + // Delay reaping m until it's done with the stack. + // + // If this is using an OS stack, the OS will free it + // so there's no need for reaping. + atomic.Store(&m.freeWait, 1) + // Put m on the free list, though it will not be reaped until + // freeWait is 0. Note that the free list must not be linked + // through alllink because some functions walk allm without + // locking, so may be using alllink. + m.freelink = sched.freem + sched.freem = m + } + unlock(&sched.lock) + + // Release the P. + handoffp(releasep()) + // After this point we must not have write barriers. + + // Invoke the deadlock detector. This must happen after + // handoffp because it may have started a new M to take our + // P's work. + lock(&sched.lock) + sched.nmfreed++ + checkdead() + unlock(&sched.lock) + + if osStack { + // Return from mstart and let the system thread + // library free the g0 stack and terminate the thread. + return + } + + // mstart is the thread's entry point, so there's nothing to + // return to. Exit the thread directly. exitThread will clear + // m.freeWait when it's done with the stack and the m can be + // reaped. + exitThread(&m.freeWait) +} + // forEachP calls fn(p) for every P p when p reaches a GC safe point. // If a P is currently executing code, this will bring the P to a GC // safe point and execute fn on that P. If the P is not executing code @@ -1182,7 +1319,7 @@ func forEachP(fn func(*p)) { sched.safePointFn = fn // Ask all Ps to run the safe point function. - for _, p := range allp[:gomaxprocs] { + for _, p := range allp { if p != _p_ { atomic.Store(&p.runSafePointFn, 1) } @@ -1210,8 +1347,7 @@ func forEachP(fn func(*p)) { // Force Ps currently in _Psyscall into _Pidle and hand them // off to induce safe point function execution. - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { s := p.status if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) { if trace.enabled { @@ -1240,8 +1376,7 @@ func forEachP(fn func(*p)) { if sched.safePointWait != 0 { throw("forEachP: not done") } - for i := 0; i < int(gomaxprocs); i++ { - p := allp[i] + for _, p := range allp { if p.runSafePointFn != 0 { throw("forEachP: P did not run fn") } @@ -1295,6 +1430,27 @@ func allocm(_p_ *p, fn func(), allocatestack bool) (mp *m, g0Stack unsafe.Pointe if _g_.m.p == 0 { acquirep(_p_) // temporarily borrow p for mallocs in this function } + + // Release the free M list. We need to do this somewhere and + // this may free up a stack we can use. 
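The sweep that follows is the reaping half of the handshake mexit sets up above: an exiting M is published on sched.freem with freeWait set, exitThread clears freeWait once the OS thread is off the g0 stack, and allocm frees the stack only for entries whose flag has cleared, keeping the rest for a later pass. A runnable toy of the same keep-or-free sweep (freeWait and the list shape are from the patch; everything else is simplified):

package main

import (
	"fmt"
	"sync/atomic"
)

// node stands in for an m on sched.freem.
type node struct {
	freeWait uint32 // 1 while the owning thread still uses its stack
	next     *node
}

// reap frees what it safely can and returns the survivors, the same
// shape as the loop in allocm below.
func reap(list *node) (kept *node, freed int) {
	for n := list; n != nil; {
		next := n.next
		if atomic.LoadUint32(&n.freeWait) != 0 {
			n.next, kept = kept, n // not yet safe: keep for a later pass
		} else {
			freed++ // owner signalled completion: safe to free
		}
		n = next
	}
	return kept, freed
}

func main() {
	busy := &node{freeWait: 1, next: &node{}}
	kept, freed := reap(busy)
	fmt.Println(freed, kept == busy) // 1 true
}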
+ if sched.freem != nil { + lock(&sched.lock) + var newList *m + for freem := sched.freem; freem != nil; { + if freem.freeWait != 0 { + next := freem.freelink + freem.freelink = newList + newList = freem + freem = next + continue + } + stackfree(freem.g0) + freem = freem.freelink + } + sched.freem = newList + unlock(&sched.lock) + } + mp = new(m) mp.mstartfn = fn mcommoninit(mp) @@ -1431,9 +1587,9 @@ func oneNewExtraM() { casgstatus(gp, _Gidle, _Gdead) gp.m = mp mp.curg = gp - mp.locked = _LockInternal - mp.lockedg = gp - gp.lockedm = mp + mp.lockedInt++ + mp.lockedg.set(gp) + gp.lockedm.set(mp) gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1)) // put on allg for garbage collector allgadd(gp) @@ -1574,6 +1730,27 @@ func unlockextra(mp *m) { // around exec'ing while creating/destroying threads. See issue #19546. var execLock rwmutex +// newmHandoff contains a list of m structures that need new OS threads. +// This is used by newm in situations where newm itself can't safely +// start an OS thread. +var newmHandoff struct { + lock mutex + + // newm points to a list of M structures that need new OS + // threads. The list is linked through m.schedlink. + newm muintptr + + // waiting indicates that wake needs to be notified when an m + // is put on the list. + waiting bool + wake note + + // haveTemplateThread indicates that the templateThread has + // been started. This is not protected by lock. Use cas to set + // to 1. + haveTemplateThread uint32 +} + // Create a new m. It will start off with a call to fn, or else the scheduler. // fn needs to be static and not a heap allocated closure. // May run with m.p==nil, so write barriers are not allowed. @@ -1582,11 +1759,90 @@ func newm(fn func(), _p_ *p) { mp, _, _ := allocm(_p_, fn, false) mp.nextp.set(_p_) mp.sigmask = initSigmask + if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { + // We're on a locked M or a thread that may have been + // started by C. The kernel state of this thread may + // be strange (the user may have locked it for that + // purpose). We don't want to clone that into another + // thread. Instead, ask a known-good thread to create + // the thread for us. + // + // This is disabled on Plan 9. See golang.org/issue/22227. + // + // TODO: This may be unnecessary on Windows, which + // doesn't model thread creation off fork. + lock(&newmHandoff.lock) + if newmHandoff.haveTemplateThread == 0 { + throw("on a locked thread with no template thread") + } + mp.schedlink = newmHandoff.newm + newmHandoff.newm.set(mp) + if newmHandoff.waiting { + newmHandoff.waiting = false + notewakeup(&newmHandoff.wake) + } + unlock(&newmHandoff.lock) + return + } + newm1(mp) +} + +func newm1(mp *m) { execLock.rlock() // Prevent process clone. newosproc(mp) execLock.runlock() } +// startTemplateThread starts the template thread if it is not already +// running. +// +// The calling thread must itself be in a known-good state. +func startTemplateThread() { + if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) { + return + } + newm(templateThread, nil) +} + +// templateThread is a thread in a known-good state that exists solely +// to start new threads in known-good states when the calling thread +// may not be in a good state. +// +// Many programs never need this, so templateThread is started lazily +// when we first enter a state that might lead to running on a thread +// in an unknown state. +// +// templateThread runs on an M without a P, so it must not have write +// barriers.
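Before templateThread's body in the next hunk, a runnable toy of the wake-up discipline newm and templateThread share through newmHandoff: producers append under the lock and call notewakeup only if the server has announced it is waiting, and the server re-checks the queue before sleeping, so no submission is lost. A buffered channel stands in for the runtime's note; the rest is simplified:

package main

import (
	"fmt"
	"sync"
)

type handoff struct {
	mu      sync.Mutex
	work    []string
	waiting bool
	wake    chan struct{} // stands in for newmHandoff.wake
}

func (h *handoff) add(w string) {
	h.mu.Lock()
	h.work = append(h.work, w)
	if h.waiting { // only wake a server that said it was sleeping
		h.waiting = false
		h.wake <- struct{}{} // notewakeup
	}
	h.mu.Unlock()
}

func (h *handoff) serve(done chan<- struct{}) {
	for {
		h.mu.Lock()
		for len(h.work) > 0 { // drain fully before sleeping
			w := h.work[0]
			h.work = h.work[1:]
			h.mu.Unlock()
			fmt.Println("start:", w) // newm1 in the real code
			if w == "quit" {
				done <- struct{}{}
				return
			}
			h.mu.Lock()
		}
		h.waiting = true
		h.mu.Unlock()
		<-h.wake // notesleep
	}
}

func main() {
	h := &handoff{wake: make(chan struct{}, 1)}
	done := make(chan struct{})
	go h.serve(done)
	h.add("m1")
	h.add("quit")
	<-done
}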
+// +//go:nowritebarrierrec +func templateThread() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + + for { + lock(&newmHandoff.lock) + for newmHandoff.newm != 0 { + newm := newmHandoff.newm.ptr() + newmHandoff.newm = 0 + unlock(&newmHandoff.lock) + for newm != nil { + next := newm.schedlink.ptr() + newm.schedlink = 0 + newm1(newm) + newm = next + } + lock(&newmHandoff.lock) + } + newmHandoff.waiting = true + noteclear(&newmHandoff.wake) + unlock(&newmHandoff.lock) + notesleep(&newmHandoff.wake) + } +} + // Stops execution of the current m until new work is available. // Returns with acquired P. func stopm() { @@ -1609,7 +1865,9 @@ retry: notesleep(&_g_.m.park) noteclear(&_g_.m.park) if _g_.m.helpgc != 0 { + // helpgc() set _g_.m.p and _g_.m.mcache, so we have a P. gchelper() + // Undo the effects of helpgc(). _g_.m.helpgc = 0 _g_.m.mcache = nil _g_.m.p = 0 @@ -1743,7 +2001,7 @@ func wakep() { func stoplockedm() { _g_ := getg() - if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m { + if _g_.m.lockedg == 0 || _g_.m.lockedg.ptr().lockedm.ptr() != _g_.m { throw("stoplockedm: inconsistent locking") } if _g_.m.p != 0 { @@ -1755,7 +2013,7 @@ func stoplockedm() { // Wait until another thread schedules lockedg again. notesleep(&_g_.m.park) noteclear(&_g_.m.park) - status := readgstatus(_g_.m.lockedg) + status := readgstatus(_g_.m.lockedg.ptr()) if status&^_Gscan != _Grunnable { print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") dumpgstatus(_g_) @@ -1771,7 +2029,7 @@ func stoplockedm() { func startlockedm(gp *g) { _g_ := getg() - mp := gp.lockedm + mp := gp.lockedm.ptr() if mp == _g_.m { throw("startlockedm: locked to me") } @@ -1896,11 +2154,12 @@ top: // Poll network. // This netpoll is only an optimization before we resort to stealing. - // We can safely skip it if there a thread blocked in netpoll already. - // If there is any kind of logical race with that blocked thread - // (e.g. it has already returned from netpoll, but does not set lastpoll yet), - // this thread will do blocking netpoll below anyway. - if netpollinited() && sched.lastpoll != 0 { + // We can safely skip it if there are no waiters or a thread is blocked + // in netpoll already. If there is any kind of logical race with that + // blocked thread (e.g. it has already returned from netpoll, but does + // not set lastpoll yet), this thread will do blocking netpoll below + // anyway. + if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { if gp := netpoll(false); gp != nil { // non-blocking // netpoll returns list of goroutines linked by schedlink. injectglist(gp.schedlink.ptr()) @@ -1996,9 +2255,8 @@ stop: } // check all runqueues once again - for i := 0; i < int(gomaxprocs); i++ { - _p_ := allp[i] - if _p_ != nil && !runqempty(_p_) { + for _, _p_ := range allp { + if !runqempty(_p_) { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) @@ -2137,9 +2395,15 @@ func schedule() { throw("schedule: holding locks") } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { stoplockedm() - execute(_g_.m.lockedg, false) // Never returns. + execute(_g_.m.lockedg.ptr(), false) // Never returns. + } + + // We should not schedule away from a g that is executing a cgo call, + // since the cgo call is using the m's g0 stack. + if _g_.m.incgo { + throw("schedule: in cgo") } top: @@ -2205,7 +2469,7 @@ top: resetspinning() } - if gp.lockedm != nil { + if gp.lockedm != 0 { // Hands off own p to the locked m, // then blocks waiting for a new p. 
startlockedm(gp) @@ -2322,8 +2586,9 @@ func goexit0(gp *g) { gp.isSystemGoroutine = false } gp.m = nil - gp.lockedm = nil - _g_.m.lockedg = nil + locked := gp.lockedm != 0 + gp.lockedm = 0 + _g_.m.lockedg = 0 gp.entry = nil gp.paniconfault = false gp._defer = nil // should be true already but just in case. @@ -2334,17 +2599,38 @@ func goexit0(gp *g) { gp.labels = nil gp.timer = nil + if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 { + // Flush assist credit to the global pool. This gives + // better information to pacing if the application is + // rapidly creating and exiting goroutines. + scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes)) + atomic.Xaddint64(&gcController.bgScanCredit, scanCredit) + gp.gcAssistBytes = 0 + } + // Note that gp's stack scan is now "valid" because it has no // stack. gp.gcscanvalid = true dropg() - if _g_.m.locked&^_LockExternal != 0 { - print("invalid m->locked = ", _g_.m.locked, "\n") + if _g_.m.lockedInt != 0 { + print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n") throw("internal lockOSThread error") } - _g_.m.locked = 0 + _g_.m.lockedExt = 0 gfput(_g_.m.p.ptr(), gp) + if locked { + // The goroutine may have locked this thread because + // it put it in an unusual kernel state. Kill it + // rather than returning it to the thread pool. + // Return to mstart, which will release the P and exit + // the thread. + if GOOS != "plan9" { // See golang.org/issue/22227. + _g_.m.exiting = true + gogo(_g_.m.g0) + } + } schedule() } @@ -2481,7 +2767,9 @@ func exitsyscall(dummy int32) { oldp := _g_.m.p.ptr() if exitsyscallfast() { if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } if trace.enabled { if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { @@ -2519,7 +2807,9 @@ func exitsyscall(dummy int32) { mcall(exitsyscall0) if _g_.m.mcache == nil { - throw("lost mcache") + systemstack(func() { + throw("lost mcache") + }) } // Scheduler returned, so we're allowed to run now. @@ -2644,7 +2934,7 @@ func exitsyscall0(gp *g) { acquirep(_p_) execute(gp, false) // Never returns. } - if _g_.m.lockedg != nil { + if _g_.m.lockedg != 0 { // Wait until another thread schedules gp and so m again. stoplockedm() execute(gp, false) // Never returns. @@ -2798,7 +3088,7 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { newg.entry = entry newg.param = arg - newg.gopc = getcallerpc(unsafe.Pointer(&fn)) + newg.gopc = getcallerpc() newg.startpc = fn if _g_.m.curg != nil { newg.labels = _g_.m.curg.labels @@ -2827,7 +3117,7 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { runqput(_p_, newg, true) - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && runtimeInitTime != 0 { + if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && mainStarted { wakep() } _g_.m.locks-- @@ -2947,23 +3237,41 @@ func Breakpoint() { //go:nosplit func dolockOSThread() { _g_ := getg() - _g_.m.lockedg = _g_ - _g_.lockedm = _g_.m + _g_.m.lockedg.set(_g_) + _g_.lockedm.set(_g_.m) } //go:nosplit // LockOSThread wires the calling goroutine to its current operating system thread. -// Until the calling goroutine exits or calls UnlockOSThread, it will always -// execute in that thread, and no other goroutine can. +// The calling goroutine will always execute in that thread, +// and no other goroutine will execute in it, +// until the calling goroutine has made as many calls to +// UnlockOSThread as to LockOSThread.
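To make the paragraph above concrete before it continues: a runnable example of the new nesting behavior (it mirrors the TestLockOSThreadNesting test added later in this patch):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		runtime.LockOSThread()
		runtime.LockOSThread() // nests: two active lock calls
		runtime.UnlockOSThread()
		// Still wired to the thread: one LockOSThread call remains.
		runtime.UnlockOSThread()
		// Fully unlocked; an extra UnlockOSThread is now a no-op.
		runtime.UnlockOSThread()
		fmt.Println("unlocked")
	}()
	<-done
}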
+// If the calling goroutine exits without unlocking the thread, +// the thread will be terminated. +// +// A goroutine should call LockOSThread before calling OS services or +// non-Go library functions that depend on per-thread state. func LockOSThread() { - getg().m.locked |= _LockExternal + if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" { + // If we need to start a new thread from the locked + // thread, we need the template thread. Start it now + // while we're in a known-good state. + startTemplateThread() + } + _g_ := getg() + _g_.m.lockedExt++ + if _g_.m.lockedExt == 0 { + _g_.m.lockedExt-- + panic("LockOSThread nesting overflow") + } dolockOSThread() } //go:nosplit func lockOSThread() { - getg().m.locked += _LockInternal + getg().m.lockedInt++ dolockOSThread() } @@ -2973,29 +3281,43 @@ func lockOSThread() { //go:nosplit func dounlockOSThread() { _g_ := getg() - if _g_.m.locked != 0 { + if _g_.m.lockedInt != 0 || _g_.m.lockedExt != 0 { return } - _g_.m.lockedg = nil - _g_.lockedm = nil + _g_.m.lockedg = 0 + _g_.lockedm = 0 } //go:nosplit -// UnlockOSThread unwires the calling goroutine from its fixed operating system thread. -// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op. +// UnlockOSThread undoes an earlier call to LockOSThread. +// If this drops the number of active LockOSThread calls on the +// calling goroutine to zero, it unwires the calling goroutine from +// its fixed operating system thread. +// If there are no active LockOSThread calls, this is a no-op. +// +// Before calling UnlockOSThread, the caller must ensure that the OS +// thread is suitable for running other goroutines. If the caller made +// any permanent changes to the state of the thread that would affect +// other goroutines, it should not call this function and thus leave +// the goroutine locked to the OS thread until the goroutine (and +// hence the thread) exits. func UnlockOSThread() { - getg().m.locked &^= _LockExternal + _g_ := getg() + if _g_.m.lockedExt == 0 { + return + } + _g_.m.lockedExt-- dounlockOSThread() } //go:nosplit func unlockOSThread() { _g_ := getg() - if _g_.m.locked < _LockInternal { + if _g_.m.lockedInt == 0 { systemstack(badunlockosthread) } - _g_.m.locked -= _LockInternal + _g_.m.lockedInt-- dounlockOSThread() } @@ -3005,10 +3327,7 @@ func badunlockosthread() { func gcount() int32 { n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys)) - for _, _p_ := range &allp { - if _p_ == nil { - break - } + for _, _p_ := range allp { n -= _p_.gfreecnt } @@ -3021,7 +3340,7 @@ func gcount() int32 { } func mcount() int32 { - return sched.mcount + return int32(sched.mnext - sched.nmfreed) } var prof struct { @@ -3190,7 +3509,7 @@ func setcpuprofilerate(hz int32) { // Returns list of Ps with local work, they need to be scheduled by the caller. func procresize(nprocs int32) *p { old := gomaxprocs - if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs { + if old < 0 || nprocs <= 0 { throw("procresize: invalid arg") } if trace.enabled { @@ -3204,6 +3523,23 @@ func procresize(nprocs int32) *p { } sched.procresizetime = now + // Grow allp if necessary. + if nprocs > int32(len(allp)) { + // Synchronize with retake, which could be running + // concurrently since it doesn't run on a P. + lock(&allpLock) + if nprocs <= int32(cap(allp)) { + allp = allp[:nprocs] + } else { + nallp := make([]*p, nprocs) + // Copy everything up to allp's cap so we + // never lose old allocated Ps. 
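The copy on the next line is the subtle step of this grow path: it copies up to cap(allp) rather than len(allp), so P pointers parked beyond the current length by an earlier shrink stay in the new backing array instead of being lost. The same move in isolation (a sketch; growPreservingCap is a hypothetical name and []*int stands in for []*p):

func growPreservingCap(s []*int, n int) []*int {
	if n <= cap(s) {
		return s[:n] // reuse the backing array
	}
	ns := make([]*int, n)
	copy(ns, s[:cap(s)]) // keep entries beyond len(s) too
	return ns
}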
+ copy(nallp, allp[:cap(allp)]) + allp = nallp + } + unlock(&allpLock) + } + // initialize new P's for i := int32(0); i < nprocs; i++ { pp := allp[i] @@ -3213,6 +3549,7 @@ func procresize(nprocs int32) *p { pp.status = _Pgcstop pp.sudogcache = pp.sudogbuf[:0] pp.deferpool = pp.deferpoolbuf[:0] + pp.wbBuf.reset() atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp)) } if pp.mcache == nil { @@ -3230,13 +3567,11 @@ func procresize(nprocs int32) *p { // free unused P's for i := nprocs; i < old; i++ { p := allp[i] - if trace.enabled { - if p == getg().m.p.ptr() { - // moving to p[0], pretend that we were descheduled - // and then scheduled again to keep the trace sane. - traceGoSched() - traceProcStop(p) - } + if trace.enabled && p == getg().m.p.ptr() { + // moving to p[0], pretend that we were descheduled + // and then scheduled again to keep the trace sane. + traceGoSched() + traceProcStop(p) } // move all runnable goroutines to the global queue for p.runqhead != p.runqtail { @@ -3262,6 +3597,11 @@ func procresize(nprocs int32) *p { // world is stopped. p.gcBgMarkWorker.set(nil) } + // Flush p's write barrier buffer. + if gcphase != _GCoff { + wbBufFlush1(p) + p.gcw.dispose() + } for i := range p.sudogbuf { p.sudogbuf[i] = nil } @@ -3274,10 +3614,18 @@ func procresize(nprocs int32) *p { p.mcache = nil gfpurge(p) traceProcFree(p) + p.gcAssistTime = 0 p.status = _Pdead // can't free P itself because it can be referenced by an M in syscall } + // Trim allp. + if int32(len(allp)) != nprocs { + lock(&allpLock) + allp = allp[:nprocs] + unlock(&allpLock) + } + _g_ := getg() if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs { // continue to use the current P @@ -3349,7 +3697,7 @@ func acquirep1(_p_ *p) { throw("acquirep: already in go") } if _p_.m != 0 || _p_.status != _Pidle { - id := int32(0) + id := int64(0) if _p_.m != 0 { id = _p_.m.ptr().id } @@ -3394,6 +3742,7 @@ func incidlelocked(v int32) { // Check for deadlock situation. // The check is based on number of running M's, if 0 -> deadlock. +// sched.lock must be held. func checkdead() { // For -buildmode=c-shared or -buildmode=c-archive it's OK if // there are no running goroutines. The calling program is @@ -3410,13 +3759,12 @@ func checkdead() { return } - // -1 for sysmon - run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1 + run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys if run > 0 { return } if run < 0 { - print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n") + print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n") throw("checkdead: inconsistent counts") } @@ -3479,6 +3827,11 @@ var forcegcperiod int64 = 2 * 60 * 1e9 // //go:nowritebarrierrec func sysmon() { + lock(&sched.lock) + sched.nmsys++ + checkdead() + unlock(&sched.lock) + // If a heap span goes unused for 5 minutes after a garbage collection, // we hand it back to the operating system. 
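An aside on sysmon registering itself in sched.nmsys just above: it feeds checkdead's new arithmetic, where mcount() is sched.mnext - sched.nmfreed and system Ms are subtracted explicitly instead of the old hard-coded "-1 for sysmon". A worked example under assumed counts:

// Suppose 7 thread IDs were handed out (sched.mnext == 7) and one M was
// freed (sched.nmfreed == 1), so mcount() == 6. With 2 idle Ms, 1 idle
// locked M, and 2 system Ms (say sysmon and the template thread):
//
//	run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys
//	    // == 6 - 2 - 1 - 2 == 1
//
// One M is still running user code, so there is no deadlock; run == 0
// with runnable goroutines would make checkdead throw.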
scavengelimit := int64(5 * 60 * 1e9) @@ -3518,15 +3871,11 @@ func sysmon() { } shouldRelax := true if osRelaxMinNS > 0 { - lock(&timers.lock) - if timers.sleeping { - now := nanotime() - next := timers.sleepUntil - if next-now < osRelaxMinNS { - shouldRelax = false - } + next := timeSleepUntil() + now := nanotime() + if next-now < osRelaxMinNS { + shouldRelax = false } - unlock(&timers.lock) } if shouldRelax { osRelax(true) @@ -3550,7 +3899,7 @@ func sysmon() { // poll network if not polled for more than 10ms lastpoll := int64(atomic.Load64(&sched.lastpoll)) now := nanotime() - if lastpoll != 0 && lastpoll+10*1000*1000 < now { + if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) gp := netpoll(false) // non-blocking - returns list of goroutines if gp != nil { @@ -3607,9 +3956,17 @@ const forcePreemptNS = 10 * 1000 * 1000 // 10ms func retake(now int64) uint32 { n := 0 - for i := int32(0); i < gomaxprocs; i++ { + // Prevent allp slice changes. This lock will be completely + // uncontended unless we're already stopping the world. + lock(&allpLock) + // We can't use a range loop over allp because we may + // temporarily drop the allpLock. Hence, we need to re-fetch + // allp each time around the loop. + for i := 0; i < len(allp); i++ { _p_ := allp[i] if _p_ == nil { + // This can happen if procresize has grown + // allp but not yet created new Ps. continue } pd := &_p_.sysmontick @@ -3628,6 +3985,8 @@ func retake(now int64) uint32 { if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now { continue } + // Drop allpLock so we can take sched.lock. + unlock(&allpLock) // Need to decrement number of idle locked M's // (pretending that one more is running) before the CAS. // Otherwise the M from which we retake can exit the syscall, @@ -3643,6 +4002,7 @@ func retake(now int64) uint32 { handoffp(_p_) } incidlelocked(1) + lock(&allpLock) } else if s == _Prunning { // Preempt G if it's running for too long. t := int64(_p_.schedtick) @@ -3657,6 +4017,7 @@ func retake(now int64) uint32 { preemptone(_p_) } } + unlock(&allpLock) return uint32(n) } @@ -3667,9 +4028,8 @@ func retake(now int64) uint32 { // Returns true if preemption request was issued to at least one goroutine. func preemptall() bool { res := false - for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil || _p_.status != _Prunning { + for _, _p_ := range allp { + if _p_.status != _Prunning { continue } if preemptone(_p_) { @@ -3727,23 +4087,19 @@ func schedtrace(detailed bool) { } lock(&sched.lock) - print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) + print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) if detailed { print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n") } // We must be careful while reading data from P's, M's and G's. // Even if we hold schedlock, most data can be changed concurrently. // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. 
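An aside on the retake loop above, since the same discipline recurs elsewhere: the index loop with a nil check is deliberate, because retake drops allpLock mid-iteration to take sched.lock, and procresize may grow allp in that window. A minimal sketch of the idiom (needsHandoff and handoff are hypothetical stand-ins):

lock(&allpLock)
for i := 0; i < len(allp); i++ { // no range: allp may change while unlocked
	p := allp[i]
	if p == nil {
		continue // procresize grew allp but hasn't filled this slot yet
	}
	if needsHandoff(p) {
		unlock(&allpLock) // can't hold allpLock while taking sched.lock
		handoff(p)
		lock(&allpLock)
	}
}
unlock(&allpLock)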
- for i := int32(0); i < gomaxprocs; i++ { - _p_ := allp[i] - if _p_ == nil { - continue - } + for i, _p_ := range allp { mp := _p_.m.ptr() h := atomic.Load(&_p_.runqhead) t := atomic.Load(&_p_.runqtail) if detailed { - id := int32(-1) + id := int64(-1) if mp != nil { id = mp.id } @@ -3756,7 +4112,7 @@ func schedtrace(detailed bool) { print("[") } print(t - h) - if i == gomaxprocs-1 { + if i == len(allp)-1 { print("]\n") } } @@ -3770,7 +4126,7 @@ func schedtrace(detailed bool) { for mp := allm; mp != nil; mp = mp.alllink { _p_ := mp.p.ptr() gp := mp.curg - lockedg := mp.lockedg + lockedg := mp.lockedg.ptr() id1 := int32(-1) if _p_ != nil { id1 = _p_.id @@ -3790,12 +4146,12 @@ func schedtrace(detailed bool) { for gi := 0; gi < len(allgs); gi++ { gp := allgs[gi] mp := gp.m - lockedm := gp.lockedm - id1 := int32(-1) + lockedm := gp.lockedm.ptr() + id1 := int64(-1) if mp != nil { id1 = mp.id } - id2 := int32(-1) + id2 := int64(-1) if lockedm != nil { id2 = lockedm.id } @@ -4077,22 +4433,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool if stealRunNextG { // Try to steal from _p_.runnext. if next := _p_.runnext; next != 0 { - // Sleep to ensure that _p_ isn't about to run the g we - // are about to steal. - // The important use case here is when the g running on _p_ - // ready()s another g and then almost immediately blocks. - // Instead of stealing runnext in this window, back off - // to give _p_ a chance to schedule runnext. This will avoid - // thrashing gs between different Ps. - // A sync chan send/recv takes ~50ns as of time of writing, - // so 3us gives ~50x overshoot. - if GOOS != "windows" { - usleep(3) - } else { - // On windows system timer granularity is 1-15ms, - // which is way too much for this optimization. - // So just yield. - osyield() + if _p_.status == _Prunning { + // Sleep to ensure that _p_ isn't about to run the g + // we are about to steal. + // The important use case here is when the g running + // on _p_ ready()s another g and then almost + // immediately blocks. Instead of stealing runnext + // in this window, back off to give _p_ a chance to + // schedule runnext. This will avoid thrashing gs + // between different Ps. + // A sync chan send/recv takes ~50ns as of time of + // writing, so 3us gives ~50x overshoot. + if GOOS != "windows" { + usleep(3) + } else { + // On windows system timer granularity is + // 1-15ms, which is way too much for this + // optimization. So just yield. + osyield() + } } if !_p_.runnext.cas(next, 0) { continue diff --git a/libgo/go/runtime/proc_runtime_test.go b/libgo/go/runtime/proc_runtime_test.go index d56f9b14636..a7bde2c6df7 100644 --- a/libgo/go/runtime/proc_runtime_test.go +++ b/libgo/go/runtime/proc_runtime_test.go @@ -2,8 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build ignore - // Proc unit tests. In runtime package so can use runtime guts. 
package runtime diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go index 313a9610e0e..672e1fa0148 100644 --- a/libgo/go/runtime/proc_test.go +++ b/libgo/go/runtime/proc_test.go @@ -658,6 +658,116 @@ func BenchmarkClosureCall(b *testing.B) { _ = sum } +func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) { + if runtime.GOMAXPROCS(0) == 1 { + b.Skip("skipping: GOMAXPROCS=1") + } + + wakeDelay := 5 * time.Microsecond + for _, delay := range []time.Duration{ + 0, + 1 * time.Microsecond, + 2 * time.Microsecond, + 5 * time.Microsecond, + 10 * time.Microsecond, + 20 * time.Microsecond, + 50 * time.Microsecond, + 100 * time.Microsecond, + } { + b.Run(delay.String(), func(b *testing.B) { + if b.N == 0 { + return + } + // Start two goroutines, which alternate between being + // sender and receiver in the following protocol: + // + // - The receiver spins for `delay` and then does a + // blocking receive on a channel. + // + // - The sender spins for `delay+wakeDelay` and then + // sends to the same channel. (The addition of + // `wakeDelay` improves the probability that the + // receiver will be blocking when the send occurs when + // the goroutines execute in parallel.) + // + // In each iteration of the benchmark, each goroutine + // acts once as sender and once as receiver, so each + // goroutine spins for delay twice. + // + // BenchmarkWakeupParallel is used to estimate how + // efficiently the scheduler parallelizes goroutines in + // the presence of blocking: + // + // - If both goroutines are executed on the same core, + // an increase in delay by N will increase the time per + // iteration by 4*N, because all 4 delays are + // serialized. + // + // - Otherwise, an increase in delay by N will increase + // the time per iteration by 2*N, and the time per + // iteration is 2 * (runtime overhead + chan + // send/receive pair + delay + wakeDelay). This allows + // the runtime overhead, including the time it takes + // for the unblocked goroutine to be scheduled, to be + // estimated. + ping, pong := make(chan struct{}), make(chan struct{}) + start := make(chan struct{}) + done := make(chan struct{}) + go func() { + <-start + for i := 0; i < b.N; i++ { + // sender + spin(delay + wakeDelay) + ping <- struct{}{} + // receiver + spin(delay) + <-pong + } + done <- struct{}{} + }() + go func() { + for i := 0; i < b.N; i++ { + // receiver + spin(delay) + <-ping + // sender + spin(delay + wakeDelay) + pong <- struct{}{} + } + done <- struct{}{} + }() + b.ResetTimer() + start <- struct{}{} + <-done + <-done + }) + } +} + +func BenchmarkWakeupParallelSpinning(b *testing.B) { + benchmarkWakeupParallel(b, func(d time.Duration) { + end := time.Now().Add(d) + for time.Now().Before(end) { + // do nothing + } + }) +} + +// sysNanosleep is defined by OS-specific files (such as runtime_linux_test.go) +// to sleep for the given duration. If nil, dependent tests are skipped. +// The implementation should invoke a blocking system call and not +// call time.Sleep, which would deschedule the goroutine. 
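The declaration just below leaves sysNanosleep nil unless an OS-specific test file fills it in. For concreteness, one plausible Linux definition matching the comment's requirements (a hedged sketch, not necessarily the tree's actual runtime_linux_test.go; assumes the "syscall" and "time" imports):

func init() {
	sysNanosleep = func(d time.Duration) {
		ts := syscall.NsecToTimespec(d.Nanoseconds())
		for {
			// Block the OS thread in the kernel; time.Sleep would
			// merely deschedule the goroutine.
			if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
				return
			}
		}
	}
}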
+var sysNanosleep func(d time.Duration) + +func BenchmarkWakeupParallelSyscall(b *testing.B) { + if sysNanosleep == nil { + b.Skipf("skipping on %v; sysNanosleep not defined", runtime.GOOS) + } + benchmarkWakeupParallel(b, func(d time.Duration) { + sysNanosleep(d) + }) +} + type Matrix [][]float64 func BenchmarkMatmult(b *testing.B) { @@ -722,8 +832,47 @@ func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, thres } } -/* func TestStealOrder(t *testing.T) { runtime.RunStealOrderTest() } -*/ + +func TestLockOSThreadNesting(t *testing.T) { + go func() { + e, i := runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + runtime.LockOSThread() + runtime.LockOSThread() + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 1 || i != 0 { + t.Errorf("want locked counts 1, 0; got %d, %d", e, i) + return + } + runtime.UnlockOSThread() + e, i = runtime.LockOSCounts() + if e != 0 || i != 0 { + t.Errorf("want locked counts 0, 0; got %d, %d", e, i) + return + } + }() +} + +func TestLockOSThreadExit(t *testing.T) { + testLockOSThreadExit(t, "testprog") +} + +func testLockOSThreadExit(t *testing.T, prog string) { + output := runTestProg(t, prog, "LockOSThreadMain", "GOMAXPROCS=1") + want := "OK\n" + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } + + output = runTestProg(t, prog, "LockOSThreadAlt") + if output != want { + t.Errorf("want %s, got %s\n", want, output) + } +} diff --git a/libgo/go/runtime/runtime-lldb_test.go b/libgo/go/runtime/runtime-lldb_test.go index 98bc9066662..9a287052eaf 100644 --- a/libgo/go/runtime/runtime-lldb_test.go +++ b/libgo/go/runtime/runtime-lldb_test.go @@ -5,11 +5,7 @@ package runtime_test import ( - "debug/elf" - "debug/macho" - "encoding/binary" "internal/testenv" - "io" "io/ioutil" "os" "os/exec" @@ -158,7 +154,7 @@ func TestLldbPython(t *testing.T) { t.Fatalf("failed to create file: %v", err) } - cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags", "-N -l", "-o", "a.exe") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-gcflags=all=-N -l", "-o", "a.exe") cmd.Dir = dir out, err := cmd.CombinedOutput() if err != nil { @@ -182,81 +178,3 @@ func TestLldbPython(t *testing.T) { t.Fatalf("Unexpected lldb output:\n%s", got) } } - -// Check that aranges are valid even when lldb isn't installed. 
-func TestDwarfAranges(t *testing.T) { - testenv.MustHaveGoBuild(t) - dir, err := ioutil.TempDir("", "go-build") - if err != nil { - t.Fatalf("failed to create temp directory: %v", err) - } - defer os.RemoveAll(dir) - - src := filepath.Join(dir, "main.go") - err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644) - if err != nil { - t.Fatalf("failed to create file: %v", err) - } - - cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe") - cmd.Dir = dir - out, err := cmd.CombinedOutput() - if err != nil { - t.Fatalf("building source %v\n%s", err, out) - } - - filename := filepath.Join(dir, "a.exe") - if f, err := elf.Open(filename); err == nil { - sect := f.Section(".debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else if f, err := macho.Open(filename); err == nil { - sect := f.Section("__debug_aranges") - if sect == nil { - t.Fatal("Missing aranges section") - } - verifyAranges(t, f.ByteOrder, sect.Open()) - } else { - t.Skip("Not an elf or macho binary.") - } -} - -func verifyAranges(t *testing.T, byteorder binary.ByteOrder, data io.ReadSeeker) { - var header struct { - UnitLength uint32 // does not include the UnitLength field - Version uint16 - Offset uint32 - AddressSize uint8 - SegmentSize uint8 - } - for { - offset, err := data.Seek(0, io.SeekCurrent) - if err != nil { - t.Fatalf("Seek error: %v", err) - } - if err = binary.Read(data, byteorder, &header); err == io.EOF { - return - } else if err != nil { - t.Fatalf("Error reading arange header: %v", err) - } - tupleSize := int64(header.SegmentSize) + 2*int64(header.AddressSize) - lastTupleOffset := offset + int64(header.UnitLength) + 4 - tupleSize - if lastTupleOffset%tupleSize != 0 { - t.Fatalf("Invalid arange length %d, (addr %d, seg %d)", header.UnitLength, header.AddressSize, header.SegmentSize) - } - if _, err = data.Seek(lastTupleOffset, io.SeekStart); err != nil { - t.Fatalf("Seek error: %v", err) - } - buf := make([]byte, tupleSize) - if n, err := data.Read(buf); err != nil || int64(n) < tupleSize { - t.Fatalf("Read error: %v", err) - } - for _, val := range buf { - if val != 0 { - t.Fatalf("Invalid terminator") - } - } - } -} diff --git a/libgo/go/runtime/runtime.go b/libgo/go/runtime/runtime.go index 58710de406c..d19d6afed38 100644 --- a/libgo/go/runtime/runtime.go +++ b/libgo/go/runtime/runtime.go @@ -61,6 +61,12 @@ func syscall_Getpagesize() int { return int(physPageSize) } //go:linkname os_runtime_args os.runtime_args func os_runtime_args() []string { return append([]string{}, argslice...) } +//go:linkname syscall_Exit syscall.Exit +//go:nosplit +func syscall_Exit(code int) { + exit(int32(code)) +} + // Temporary, for the gccgo runtime code written in C. 
//go:linkname get_envs runtime_get_envs func get_envs() []string { return envs } diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go index 627adf74765..b617f8598fa 100644 --- a/libgo/go/runtime/runtime1.go +++ b/libgo/go/runtime/runtime1.go @@ -111,10 +111,6 @@ var test_z64, test_x64 uint64 func testAtomic64() { test_z64 = 42 test_x64 = 0 - prefetcht0(uintptr(unsafe.Pointer(&test_z64))) - prefetcht1(uintptr(unsafe.Pointer(&test_z64))) - prefetcht2(uintptr(unsafe.Pointer(&test_z64))) - prefetchnta(uintptr(unsafe.Pointer(&test_z64))) if atomic.Cas64(&test_z64, test_x64, 1) { throw("cas64 failed") } @@ -413,13 +409,6 @@ func parsedebugvars() { setTraceback(gogetenv("GOTRACEBACK")) traceback_env = traceback_cache - - // For cgocheck > 1, we turn on the write barrier at all times - // and check all pointer writes. - if debug.cgocheck > 1 { - writeBarrier.cgo = true - writeBarrier.enabled = true - } } //go:linkname setTraceback runtime_debug.SetTraceback diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 045e76ff4df..543086d09aa 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -173,9 +173,13 @@ func efaceOf(ep *interface{}) *eface { // a word that is completely ignored by the GC than to have one for which // only a few updates are ignored. // -// Gs, Ms, and Ps are always reachable via true pointers in the -// allgs, allm, and allp lists or (during allocation before they reach those lists) +// Gs and Ps are always reachable via true pointers in the +// allgs and allp lists or (during allocation before they reach those lists) // from stack variables. +// +// Ms are always reachable via true pointers either from allm or +// freem. Unlike Gs and Ps we do free Ms, so it's important that +// nothing ever hold an muintptr across a safe point. // A guintptr holds a goroutine pointer, but typed as a uintptr // to bypass write barriers. It is used in the Gobuf goroutine state @@ -225,6 +229,15 @@ func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) } //go:nosplit func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) } +// muintptr is a *m that is not tracked by the garbage collector. +// +// Because we do free Ms, there are some additional constraints on +// muintptrs: +// +// 1. Never hold an muintptr locally across a safe point. +// +// 2. Any muintptr in the heap must be owned by the M itself so it can +// ensure it is not in use when the last true *m is released. type muintptr uintptr //go:nosplit @@ -256,11 +269,14 @@ type sudog struct { // channel this sudog is blocking on. shrinkstack depends on // this for sudogs involved in channel ops. - g *g - selectdone *uint32 // CAS to 1 to win select race (may point to stack) - next *sudog - prev *sudog - elem unsafe.Pointer // data element (may point to stack) + g *g + + // isSelect indicates g is participating in a select, so + // g.selectDone must be CAS'd to win the wake-up race. + isSelect bool + next *sudog + prev *sudog + elem unsafe.Pointer // data element (may point to stack) // The following fields are never accessed concurrently. // For channels, waitlink is only accessed by g.
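The muintptr rules above are easiest to see as code. A runnable sketch of the pointer-laundering pattern (ptr and set match the patch's methods; the m type here is a stub):

package main

import (
	"fmt"
	"unsafe"
)

type m struct{ id int64 }

// muintptr mirrors the patch: a *m stored as a uintptr, so the GC and
// write barriers ignore it entirely.
type muintptr uintptr

func (mp muintptr) ptr() *m   { return (*m)(unsafe.Pointer(mp)) }
func (mp *muintptr) set(x *m) { *mp = muintptr(unsafe.Pointer(x)) }

func main() {
	mm := &m{id: 7}
	var link muintptr
	link.set(mm)                      // no write barrier: an invisible reference
	fmt.Println(link.ptr().id, mm.id) // mm also keeps the M reachable here
	// The two rules follow from that invisibility: never hold a muintptr
	// (or the *m from ptr) across a safe point, and a heap muintptr must
	// be owned by the M it names, because Ms, unlike Gs and Ps, are freed.
}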
@@ -351,7 +367,7 @@ type g struct { sysexitticks int64 // cputicks when syscall has returned (for tracing) traceseq uint64 // trace event sequencer tracelastp puintptr // last P emitted an event for this goroutine - lockedm *m + lockedm muintptr sig uint32 writebuf []byte sigcode0 uintptr @@ -362,8 +378,9 @@ type g struct { // Not for gccgo: racectx uintptr waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order // Not for gccgo: cgoCtxt []uintptr // cgo traceback context - labels unsafe.Pointer // profiler labels - timer *timer // cached timer for time.Sleep + labels unsafe.Pointer // profiler labels + timer *timer // cached timer for time.Sleep + selectDone uint32 // are we participating in a select and did someone win the race? // Per-G GC state @@ -381,13 +398,26 @@ type g struct { exception unsafe.Pointer // current exception being thrown isforeign bool // whether current exception is not from Go - // Fields that hold stack and context information if status is Gsyscall + // When using split-stacks, these fields holds the results of + // __splitstack_find while executing a syscall. These are used + // by the garbage collector to scan the goroutine's stack. + // + // When not using split-stacks, g0 stacks are allocated by the + // libc and other goroutine stacks are allocated by malg. + // gcstack: unused (sometimes cleared) + // gcstacksize: g0: 0; others: size of stack + // gcnextsegment: unused + // gcnextsp: current SP while executing a syscall + // gcinitialsp: g0: top of stack; others: start of stack memory gcstack uintptr gcstacksize uintptr gcnextsegment uintptr gcnextsp uintptr gcinitialsp unsafe.Pointer - gcregs g_ucontext_t + + // gcregs holds the register values while executing a syscall. + // This is set by getcontext and scanned by the garbage collector. + gcregs g_ucontext_t entry func(unsafe.Pointer) // goroutine function to run entryfn uintptr // function address passed to __go_go @@ -411,14 +441,15 @@ type m struct { // Fields not known to debuggers. procid uint64 // for debuggers, but offset not hard-coded gsignal *g // signal-handling g + // Not for gccgo: goSigStack gsignalStack // Go-allocated signal handling stack sigmask sigset // storage for saved signal mask - // Not for gccgo: tls [6]uintptr // thread-local storage (for x86 extern register) + // Not for gccgo: tls [6]uintptr // thread-local storage (for x86 extern register) mstartfn func() curg *g // current running goroutine caughtsig guintptr // goroutine running during fatal signal p puintptr // attached p for executing go code (nil if not executing go code) nextp puintptr - id int32 + id int64 mallocing int32 throwing int32 preemptoff string // if != "", keep curg running on this m @@ -432,8 +463,11 @@ type m struct { inwb bool // m is executing a write barrier newSigstack bool // minit on C thread called sigaltstack printlock int8 - incgo bool // m is executing a cgo call - fastrand uint32 + incgo bool // m is executing a cgo call + freeWait uint32 // if == 0, safe to free g0 and delete m (atomic) + fastrand [2]uint32 + needextram bool + traceback uint8 ncgocall uint64 // number of cgo calls in total ncgo int32 // number of cgo calls currently in progress // Not for gccgo: cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily @@ -442,15 +476,14 @@ type m struct { alllink *m // on allm schedlink muintptr mcache *mcache - lockedg *g + lockedg guintptr createstack [32]location // stack that created this thread. 
- // Not for gccgo: freglo [16]uint32 // d[i] lsb and f[i] - // Not for gccgo: freghi [16]uint32 // d[i] msb and f[i+16] - // Not for gccgo: fflag uint32 // floating point compare flags - locked uint32 // tracking for lockosthread - nextwaitm uintptr // next m waiting for lock - needextram bool - traceback uint8 + // Not for gccgo: freglo [16]uint32 // d[i] lsb and f[i] + // Not for gccgo: freghi [16]uint32 // d[i] msb and f[i+16] + // Not for gccgo: fflag uint32 // floating point compare flags + lockedExt uint32 // tracking for external LockOSThread + lockedInt uint32 // tracking for internal lockOSThread + nextwaitm muintptr // next m waiting for lock waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool waitlock unsafe.Pointer waittraceev byte @@ -458,6 +491,7 @@ type m struct { startingtrace bool syscalltick uint32 // Not for gccgo: thread uintptr // thread handle + freelink *m // on sched.freem // these are here because they are too large to be on the stack // of low-level NOSPLIT functions. @@ -475,6 +509,7 @@ type m struct { gsignalstacksize uintptr dropextram bool // drop after call is done + exiting bool // thread is exiting gcing int32 } @@ -490,7 +525,7 @@ type p struct { sysmontick sysmontick // last tick observed by sysmon m muintptr // back-link to associated m (nil if idle) mcache *mcache - // Not for gccgo: racectx uintptr + racectx uintptr // gccgo has only one size of defer. deferpool []*_defer @@ -535,26 +570,30 @@ type p struct { palloc persistentAlloc // per-P to avoid mutex // Per-P GC state - gcAssistTime int64 // Nanoseconds in assistAlloc - gcBgMarkWorker guintptr - gcMarkWorkerMode gcMarkWorkerMode + gcAssistTime int64 // Nanoseconds in assistAlloc + gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker + gcBgMarkWorker guintptr + gcMarkWorkerMode gcMarkWorkerMode + + // gcMarkWorkerStartTime is the nanotime() at which this mark + // worker started. + gcMarkWorkerStartTime int64 // gcw is this P's GC work buffer cache. The work buffer is // filled by write barriers, drained by mutator assists, and // disposed on certain GC state transitions. gcw gcWork + // wbBuf is this P's GC write barrier buffer. + // + // TODO: Consider caching this in the running G. + wbBuf wbBuf + runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point pad [sys.CacheLineSize]byte } -const ( - // The max value of GOMAXPROCS. - // There are no fundamental restrictions on the value. - _MaxGomaxprocs = 1 << 10 -) - type schedt struct { // accessed atomically. keep at top to ensure alignment on 32-bit systems. goidgen uint64 @@ -562,11 +601,16 @@ type schedt struct { lock mutex + // When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be + // sure to call checkdead(). + midle muintptr // idle m's waiting for work nmidle int32 // number of idle m's waiting for work nmidlelocked int32 // number of locked m's waiting for work - mcount int32 // number of m's that have been created + mnext int64 // number of m's that have been created and next M ID maxmcount int32 // maximum number of m's allowed (or die) + nmsys int32 // number of system m's not counted for deadlock + nmfreed int64 // cumulative number of freed m's ngsys uint32 // number of system goroutines; updated atomically @@ -592,6 +636,10 @@ type schedt struct { deferlock mutex deferpool *_defer + // freem is the list of m's waiting to be freed when their + // m.exited is set. Linked through m.freelink. 
+ freem *m + gcwaiting uint32 // gc is waiting to run stopwait int32 stopnote note @@ -610,18 +658,7 @@ type schedt struct { totaltime int64 // ∫gomaxprocs dt up to procresizetime } -// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. -// The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. -// External locks are not recursive; a second lock is silently ignored. -// The upper bits of m.locked record the nesting depth of calls to lockOSThread -// (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). -// Internal locks can be recursive. For instance, a lock for cgo can occur while the main -// goroutine is holding the lock during the initialization phase. -const ( - _LockExternal = 1 - _LockInternal = 2 -) - +// Values for the flags field of a sigTabT. const ( _SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel _SigKill // if signal.Notify doesn't take it, exit quietly @@ -630,7 +667,8 @@ const ( _SigDefault // if the signal isn't explicitly requested, don't monitor it _SigGoExit // cause all runtime procs to exit (only used on Plan 9). _SigSetStack // add SA_ONSTACK to libc handler - _SigUnblock // unblocked in minit + _SigUnblock // always unblock; see blockableSig + _SigIgn // _SIG_DFL action is to ignore the signal ) // Lock-free stack node. @@ -671,8 +709,8 @@ func extendRandom(r []byte, n int) { } } -// deferred subroutine calls -// This is the gccgo version. +// A _defer holds an entry on the list of deferred calls. +// If you add a field here, add code to clear it in freedefer. type _defer struct { // The next entry in the stack. link *_defer @@ -743,7 +781,8 @@ const _TracebackMaxFrames = 100 var ( allglen uintptr allm *m - allp [_MaxGomaxprocs + 1]*p + allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable + allpLock mutex // Protects P-less reads of allp and all writes gomaxprocs int32 ncpu int32 forcegc forcegcstate diff --git a/libgo/go/runtime/runtime_mmap_test.go b/libgo/go/runtime/runtime_mmap_test.go index 0141e81d4a0..c0040414d46 100644 --- a/libgo/go/runtime/runtime_mmap_test.go +++ b/libgo/go/runtime/runtime_mmap_test.go @@ -14,17 +14,10 @@ import ( // what the code in mem_bsd.go, mem_darwin.go, and mem_linux.go expects. // See the uses of ENOMEM in sysMap in those files. func TestMmapErrorSign(t *testing.T) { - p := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + p, err := runtime.Mmap(nil, ^uintptr(0)&^(runtime.GetPhysPageSize()-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) - // The runtime.mmap function is nosplit, but t.Errorf is not. - // Reset the pointer so that we don't get an "invalid stack - // pointer" error from t.Errorf if we call it. - v := uintptr(p) - p = nil - - err := runtime.Errno() - if v != ^uintptr(0) || err != runtime.ENOMEM { - t.Errorf("mmap = %v, %v, want %v", v, err, runtime.ENOMEM) + if p != nil || err != runtime.ENOMEM { + t.Errorf("mmap = %v, %v, want nil, %v", p, err, runtime.ENOMEM) } } @@ -34,20 +27,20 @@ func TestPhysPageSize(t *testing.T) { ps := runtime.GetPhysPageSize() // Get a region of memory to play with. This should be page-aligned. 
- b := uintptr(runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0)) - if b == ^uintptr(0) { - t.Fatalf("Mmap: %v %v", b, runtime.Errno()) + b, err := runtime.Mmap(nil, 2*ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0) + if err != 0 { + t.Fatalf("Mmap: %v", err) } // Mmap should fail at a half page into the buffer. - err := uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err != ^uintptr(0) { + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps/2), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err == 0 { t.Errorf("Mmap should have failed with half-page alignment %d, but succeeded: %v", ps/2, err) } // Mmap should succeed at a full page into the buffer. - err = uintptr(runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0)) - if err == ^uintptr(0) { - t.Errorf("Mmap at full-page alignment %d failed: %v %v", ps, err, runtime.Errno()) + _, err = runtime.Mmap(unsafe.Pointer(uintptr(b)+ps), ps, 0, runtime.MAP_ANON|runtime.MAP_PRIVATE|runtime.MAP_FIXED, -1, 0) + if err != 0 { + t.Errorf("Mmap at full-page alignment %d failed: %v", ps, err) } } diff --git a/libgo/go/runtime/runtime_test.go b/libgo/go/runtime/runtime_test.go index b8f6ac2aed4..0231043260b 100644 --- a/libgo/go/runtime/runtime_test.go +++ b/libgo/go/runtime/runtime_test.go @@ -5,6 +5,7 @@ package runtime_test import ( + "flag" "io" . "runtime" "runtime/debug" @@ -13,6 +14,8 @@ import ( "unsafe" ) +var flagQuick = flag.Bool("quick", false, "skip slow tests, for second run in all.bash") + func init() { // We're testing the runtime, so make tracebacks show things // in the runtime. This only raises the level, so it won't @@ -196,9 +199,9 @@ func eqstring_generic(s1, s2 string) bool { } func TestEqString(t *testing.T) { - // This isn't really an exhaustive test of eqstring, it's + // This isn't really an exhaustive test of == on strings, it's // just a convenient way of documenting (via eqstring_generic) - // what eqstring does. + // what == does. s := []string{ "", "a", @@ -213,7 +216,7 @@ func TestEqString(t *testing.T) { x := s1 == s2 y := eqstring_generic(s1, s2) if x != y { - t.Errorf(`eqstring("%s","%s") = %t, want %t`, s1, s2, x, y) + t.Errorf(`("%s" == "%s") = %t, want %t`, s1, s2, x, y) } } } diff --git a/libgo/go/runtime/rwmutex_test.go b/libgo/go/runtime/rwmutex_test.go index a69eca1511f..872b3b098e8 100644 --- a/libgo/go/runtime/rwmutex_test.go +++ b/libgo/go/runtime/rwmutex_test.go @@ -12,6 +12,7 @@ package runtime_test import ( "fmt" . "runtime" + "runtime/debug" "sync/atomic" "testing" ) @@ -47,6 +48,10 @@ func doTestParallelReaders(numReaders int) { func TestParallelRWMutexReaders(t *testing.T) { defer GOMAXPROCS(GOMAXPROCS(-1)) + // If runtime triggers a forced GC during this test then it will deadlock, + // since the goroutines can't be stopped/preempted. + // Disable GC for this test (see issue #10958). 
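The defer on the next line packs a save-and-restore into one expression: debug.SetGCPercent returns the previous setting, so the inner call disables GC now and the deferred outer call restores the old percentage when the test returns. The equivalent long form:

old := debug.SetGCPercent(-1) // disable GC, remember the old setting
defer debug.SetGCPercent(old) // restore it on the way out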
+ defer debug.SetGCPercent(debug.SetGCPercent(-1)) doTestParallelReaders(1) doTestParallelReaders(3) doTestParallelReaders(4) diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go index 9f8ac49d972..096af52be35 100644 --- a/libgo/go/runtime/select.go +++ b/libgo/go/runtime/select.go @@ -88,7 +88,7 @@ func newselect(sel *hselect, selsize int64, size int32) { } func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectsend: too many cases") @@ -109,7 +109,7 @@ func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) { } func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectrecv: too many cases") @@ -131,7 +131,7 @@ func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) { } func selectdefault(sel *hselect) { - pc := getcallerpc(unsafe.Pointer(&sel)) + pc := getcallerpc() i := sel.ncase if i >= sel.tcase { throw("selectdefault: too many cases") @@ -301,7 +301,6 @@ func selectgo(sel *hselect) int { var ( gp *g - done uint32 sg *sudog c *hchan k *scase @@ -368,7 +367,6 @@ loop: // pass 2 - enqueue on all chans gp = getg() - done = 0 if gp.waiting != nil { throw("gp.waiting != nil") } @@ -382,8 +380,7 @@ loop: c = cas.c sg := acquireSudog() sg.g = gp - // Note: selectdone is adjusted for stack copies in stack1.go:adjustsudogs - sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done))) + sg.isSelect = true // No stack splits between assigning elem and enqueuing // sg on gp.waiting where copystack can find it. sg.elem = cas.elem @@ -409,62 +406,9 @@ loop: gp.param = nil gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1) - // While we were asleep, some goroutine came along and completed - // one of the cases in the select and woke us up (called ready). - // As part of that process, the goroutine did a cas on done above - // (aka *sg.selectdone for all queued sg) to win the right to - // complete the select. Now done = 1. - // - // If we copy (grow) our own stack, we will update the - // selectdone pointers inside the gp.waiting sudog list to point - // at the new stack. Another goroutine attempting to - // complete one of our (still linked in) select cases might - // see the new selectdone pointer (pointing at the new stack) - // before the new stack has real data; if the new stack has done = 0 - // (before the old values are copied over), the goroutine might - // do a cas via sg.selectdone and incorrectly believe that it has - // won the right to complete the select, executing a second - // communication and attempting to wake us (call ready) again. - // - // Then things break. - // - // The best break is that the goroutine doing ready sees the - // _Gcopystack status and throws, as in #17007. - // A worse break would be for us to continue on, start running real code, - // block in a semaphore acquisition (sema.go), and have the other - // goroutine wake us up without having really acquired the semaphore. - // That would result in the goroutine spuriously running and then - // queue up another spurious wakeup when the semaphore really is ready. - // In general the situation can cascade until something notices the - // problem and causes a crash. 
- // - // A stack shrink does not have this problem, because it locks - // all the channels that are involved first, blocking out the - // possibility of a cas on selectdone. - // - // A stack growth before gopark above does not have this - // problem, because we hold those channel locks (released by - // selparkcommit). - // - // A stack growth after sellock below does not have this - // problem, because again we hold those channel locks. - // - // The only problem is a stack growth during sellock. - // To keep that from happening, run sellock on the system stack. - // - // It might be that we could avoid this if copystack copied the - // stack before calling adjustsudogs. In that case, - // syncadjustsudogs would need to recopy the tiny part that - // it copies today, resulting in a little bit of extra copying. - // - // An even better fix, not for the week before a release candidate, - // would be to put space in every sudog and make selectdone - // point at (say) the space in the first sudog. - - systemstack(func() { - sellock(scases, lockorder) - }) + sellock(scases, lockorder) + gp.selectDone = 0 sg = (*sudog)(gp.param) gp.param = nil @@ -477,7 +421,7 @@ loop: sglist = gp.waiting // Clear all elem before unlinking from gp.waiting. for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { - sg1.selectdone = nil + sg1.isSelect = false sg1.elem = nil sg1.c = nil } @@ -528,10 +472,8 @@ loop: print("wait-return: sel=", sel, " c=", c, " cas=", cas, " kind=", cas.kind, "\n") } - if cas.kind == caseRecv { - if cas.receivedp != nil { - *cas.receivedp = true - } + if cas.kind == caseRecv && cas.receivedp != nil { + *cas.receivedp = true } if raceenabled { diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go index d04e6f592fc..6e2beeccee1 100644 --- a/libgo/go/runtime/sema.go +++ b/libgo/go/runtime/sema.go @@ -275,7 +275,10 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket. // https://en.wikipedia.org/wiki/Treap // http://faculty.washington.edu/aragon/pubs/rst89.pdf - s.ticket = fastrand() + // + // s.ticket is compared with zero in a couple of places, so we set the lowest bit. + // It will not noticeably affect the treap's quality.
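The assignment on the next line implements the comment just above: OR-ing in the low bit makes the ticket odd, hence never zero, so zero keeps working as a sentinel while only 1 of 32 random bits is sacrificed. A tiny runnable check (math/rand stands in for the runtime's fastrand):

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	for i := 0; i < 3; i++ {
		t := rand.Uint32() | 1
		fmt.Println(t, t != 0 && t&1 == 1) // always true: odd, never zero
	}
}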
+ s.ticket = fastrand() | 1 s.parent = last *pt = s diff --git a/libgo/go/runtime/signal_gccgo.go b/libgo/go/runtime/signal_gccgo.go index 056be36a729..6fe7ba10aaf 100644 --- a/libgo/go/runtime/signal_gccgo.go +++ b/libgo/go/runtime/signal_gccgo.go @@ -46,11 +46,6 @@ func kill(pid _pid_t, sig uint32) int32 //extern setitimer func setitimer(which int32, new *_itimerval, old *_itimerval) int32 -type sigTabT struct { - flags int32 - name string -} - type sigctxt struct { info *_siginfo_t ctxt unsafe.Pointer diff --git a/libgo/go/runtime/signal_sighandler.go b/libgo/go/runtime/signal_sighandler.go index 378c68e1d90..c042162e7e6 100644 --- a/libgo/go/runtime/signal_sighandler.go +++ b/libgo/go/runtime/signal_sighandler.go @@ -92,9 +92,9 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { } print("PC=", hex(sigpc), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n") - if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 { + if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 { print("signal arrived during cgo execution\n") - gp = _g_.m.lockedg + gp = _g_.m.lockedg.ptr() } print("\n") @@ -111,7 +111,7 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { if docrash { crashing++ - if crashing < sched.mcount-int32(extraMCount) { + if crashing < mcount()-int32(extraMCount) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 3237e18765f..85171484a90 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -8,7 +8,6 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -16,6 +15,16 @@ import ( //go:linkname initsig runtime.initsig //go:linkname sigtrampgo runtime.sigtrampgo +// sigTabT is the type of an entry in the global sigtable array. +// sigtable is inherently system dependent, and appears in OS-specific files, +// but sigTabT is the same for all Unixy systems. +// The sigtable array is indexed by a system signal number to get the flags +// and printable name of each signal. +type sigTabT struct { + flags int32 + name string +} + //go:linkname os_sigpipe os.sigpipe func os_sigpipe() { systemstack(sigpipe) @@ -275,6 +284,12 @@ func sigpipe() { // sigtrampgo is called from the signal handler function, sigtramp, // written in assembly code. // This is called by the signal handler, and the world may be stopped. +// +// It must be nosplit because getg() is still the G that was running +// (if any) when the signal was delivered, but it's (usually) called +// on the gsignal stack. Until this switches the G to gsignal, the +// stack bounds check won't work. +// //go:nosplit //go:nowritebarrierrec func sigtrampgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) { @@ -355,8 +370,9 @@ func sigpanic() { //go:nosplit //go:nowritebarrierrec func dieFromSignal(sig uint32) { - setsig(sig, _SIG_DFL) unblocksig(sig) + // Mark the signal as unhandled to ensure it is forwarded. + atomic.Store(&handlingSig[sig], 0) raise(sig) // That should have killed us. On some systems, though, raise @@ -368,6 +384,22 @@ func dieFromSignal(sig uint32) { osyield() osyield() + // If that didn't work, try _SIG_DFL. 
+ setsig(sig, _SIG_DFL) + raise(sig) + + osyield() + osyield() + osyield() + + // On Darwin we may still fail to die, because raise sends the + // signal to the whole process rather than just the current thread, + // and osyield just sleeps briefly rather than letting all other + // threads run. See issue 20315. Sleep longer. + if GOOS == "darwin" { + usleep(100) + } + // If we are still somehow running, just exit with the wrong status. exit(2) } @@ -434,7 +466,7 @@ func crash() { // this means the OS X core file will be >128 GB and even on a zippy // workstation can take OS X well over an hour to write (uninterruptible). // Save users from making that mistake. - if sys.PtrSize == 8 { + if GOARCH == "amd64" { return } } @@ -463,7 +495,7 @@ func ensureSigM() { var sigBlocked sigset sigfillset(&sigBlocked) for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&sigBlocked, i) } } @@ -475,7 +507,7 @@ func ensureSigM() { sigdelset(&sigBlocked, int(sig)) } case sig := <-disableSigChan: - if sig > 0 { + if sig > 0 && blockableSig(sig) { sigaddset(&sigBlocked, int(sig)) } } @@ -536,17 +568,23 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return false } fwdFn := atomic.Loaduintptr(&fwdSig[sig]) + flags := sigtable[sig].flags - if !signalsOK { - // The only way we can get here is if we are in a - // library or archive, we installed a signal handler - // at program startup, but the Go runtime has not yet - // been initialized. + // If we aren't handling the signal, forward it. + if atomic.Load(&handlingSig[sig]) == 0 || !signalsOK { + // If the signal is ignored, doing nothing is the same as forwarding. + if fwdFn == _SIG_IGN || (fwdFn == _SIG_DFL && flags&_SigIgn != 0) { + return true + } + // We are not handling the signal and there is no other handler to forward to. + // Crash with the default behavior. if fwdFn == _SIG_DFL { + setsig(sig, _SIG_DFL) dieFromSignal(sig) - } else { - sigfwd(fwdFn, sig, info, ctx) + return false } + + sigfwd(fwdFn, sig, info, ctx) return true } @@ -555,18 +593,6 @@ func sigfwdgo(sig uint32, info *_siginfo_t, ctx unsafe.Pointer) bool { return false } - // If we aren't handling the signal, forward it. - // Really if we aren't handling the signal, we shouldn't get here, - // but on Darwin setsigstack can lead us here because it sets - // the sa_tramp field. The sa_tramp field is not returned by - // sigaction, so the fix for that is non-obvious. - if atomic.Load(&handlingSig[sig]) == 0 { - sigfwd(fwdFn, sig, info, ctx) - return true - } - - flags := sigtable[sig].flags - c := sigctxt{info, ctx} // Only forward synchronous signals and SIGPIPE. // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code @@ -678,7 +704,7 @@ func minitSignalStack() { func minitSignalMask() { nmask := getg().m.sigmask for i := range sigtable { - if sigtable[i].flags&_SigUnblock != 0 { + if !blockableSig(uint32(i)) { sigdelset(&nmask, i) } } @@ -694,3 +720,22 @@ func unminitSignals() { signalstack(nil, 0) } } + +// blockableSig returns whether sig may be blocked by the signal mask. +// We never want to block the signals marked _SigUnblock; +// these are the synchronous signals that turn into a Go panic. +// In a Go program--not a c-archive/c-shared--we never want to block +// the signals marked _SigKill or _SigThrow, as otherwise it's possible +// for all running threads to block them and delay their delivery until +// we start a new thread. 
When linked into a C program we let the C code +// decide on the disposition of those signals. +func blockableSig(sig uint32) bool { + flags := sigtable[sig].flags + if flags&_SigUnblock != 0 { + return false + } + if isarchive || islibrary { + return true + } + return flags&(_SigKill|_SigThrow) == 0 +} diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go index cd036ce364c..b108c39cc85 100644 --- a/libgo/go/runtime/sigqueue.go +++ b/libgo/go/runtime/sigqueue.go @@ -45,13 +45,14 @@ import ( // as there is no connection between handling a signal and receiving one, // but atomic instructions should minimize it. var sig struct { - note note - mask [(_NSIG + 31) / 32]uint32 - wanted [(_NSIG + 31) / 32]uint32 - ignored [(_NSIG + 31) / 32]uint32 - recv [(_NSIG + 31) / 32]uint32 - state uint32 - inuse bool + note note + mask [(_NSIG + 31) / 32]uint32 + wanted [(_NSIG + 31) / 32]uint32 + ignored [(_NSIG + 31) / 32]uint32 + recv [(_NSIG + 31) / 32]uint32 + state uint32 + delivering uint32 + inuse bool } const ( @@ -60,15 +61,20 @@ const ( sigSending ) -// Called from sighandler to send a signal back out of the signal handling thread. -// Reports whether the signal was sent. If not, the caller typically crashes the program. +// sigsend delivers a signal from sighandler to the internal signal delivery queue. +// It reports whether the signal was sent. If not, the caller typically crashes the program. +// It runs from the signal handler, so it's limited in what it can do. func sigsend(s uint32) bool { bit := uint32(1) << uint(s&31) if !sig.inuse || s >= uint32(32*len(sig.wanted)) { return false } + atomic.Xadd(&sig.delivering, 1) + // We are running in the signal handler; defer is not available. + if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 { + atomic.Xadd(&sig.delivering, -1) return false } @@ -76,6 +82,7 @@ func sigsend(s uint32) bool { for { mask := sig.mask[s/32] if mask&bit != 0 { + atomic.Xadd(&sig.delivering, -1) return true // signal already in queue } if atomic.Cas(&sig.mask[s/32], mask, mask|bit) { @@ -104,6 +111,7 @@ Send: } } + atomic.Xadd(&sig.delivering, -1) return true } @@ -155,6 +163,15 @@ func signal_recv() uint32 { // by the os/signal package. //go:linkname signalWaitUntilIdle os_signal.signalWaitUntilIdle func signalWaitUntilIdle() { + // Although the signals we care about have been removed from + // sig.wanted, it is possible that another thread has received + // a signal, has read from sig.wanted, is now updating sig.mask, + // and has not yet woken up the processor thread. We need to wait + // until all current signal deliveries have completed. + for atomic.Load(&sig.delivering) != 0 { + Gosched() + } + // Although WaitUntilIdle seems like the right name for this // function, the state we are looking for is sigReceiving, not // sigIdle. The sigIdle state is really more like sigProcessing. 
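
The sigqueue.go changes above implement a small in-flight counter protocol: sigsend increments sig.delivering before it reads sig.wanted, decrements it on every exit path (defer is unavailable in a signal handler), and signalWaitUntilIdle spins with Gosched until the counter drains. A standalone sketch of the same pattern, using sync/atomic and runtime.Gosched in place of the runtime's internal primitives (illustrative, not the runtime's code):

	package main

	import (
		"runtime"
		"sync/atomic"
	)

	var delivering uint32 // deliveries currently in flight

	// deliver mirrors sigsend's shape: every return path must undo
	// the increment explicitly, since a real signal handler cannot defer.
	func deliver(wanted bool) bool {
		atomic.AddUint32(&delivering, 1)
		if !wanted {
			atomic.AddUint32(&delivering, ^uint32(0)) // decrement
			return false
		}
		// ... queue the signal and wake the receiver here ...
		atomic.AddUint32(&delivering, ^uint32(0))
		return true
	}

	// waitUntilIdle blocks until no delivery is mid-flight, like the
	// new loop in signalWaitUntilIdle.
	func waitUntilIdle() {
		for atomic.LoadUint32(&delivering) != 0 {
			runtime.Gosched()
		}
	}

	func main() {
		deliver(true)
		waitUntilIdle()
	}
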
diff --git a/libgo/go/runtime/sizeclasses.go b/libgo/go/runtime/sizeclasses.go index 5366564afda..9e17b001d3e 100644 --- a/libgo/go/runtime/sizeclasses.go +++ b/libgo/go/runtime/sizeclasses.go @@ -3,73 +3,73 @@ package runtime -// class bytes/obj bytes/span objects waste bytes -// 1 8 8192 1024 0 -// 2 16 8192 512 0 -// 3 32 8192 256 0 -// 4 48 8192 170 32 -// 5 64 8192 128 0 -// 6 80 8192 102 32 -// 7 96 8192 85 32 -// 8 112 8192 73 16 -// 9 128 8192 64 0 -// 10 144 8192 56 128 -// 11 160 8192 51 32 -// 12 176 8192 46 96 -// 13 192 8192 42 128 -// 14 208 8192 39 80 -// 15 224 8192 36 128 -// 16 240 8192 34 32 -// 17 256 8192 32 0 -// 18 288 8192 28 128 -// 19 320 8192 25 192 -// 20 352 8192 23 96 -// 21 384 8192 21 128 -// 22 416 8192 19 288 -// 23 448 8192 18 128 -// 24 480 8192 17 32 -// 25 512 8192 16 0 -// 26 576 8192 14 128 -// 27 640 8192 12 512 -// 28 704 8192 11 448 -// 29 768 8192 10 512 -// 30 896 8192 9 128 -// 31 1024 8192 8 0 -// 32 1152 8192 7 128 -// 33 1280 8192 6 512 -// 34 1408 16384 11 896 -// 35 1536 8192 5 512 -// 36 1792 16384 9 256 -// 37 2048 8192 4 0 -// 38 2304 16384 7 256 -// 39 2688 8192 3 128 -// 40 3072 24576 8 0 -// 41 3200 16384 5 384 -// 42 3456 24576 7 384 -// 43 4096 8192 2 0 -// 44 4864 24576 5 256 -// 45 5376 16384 3 256 -// 46 6144 24576 4 0 -// 47 6528 32768 5 128 -// 48 6784 40960 6 256 -// 49 6912 49152 7 768 -// 50 8192 8192 1 0 -// 51 9472 57344 6 512 -// 52 9728 49152 5 512 -// 53 10240 40960 4 0 -// 54 10880 32768 3 128 -// 55 12288 24576 2 0 -// 56 13568 40960 3 256 -// 57 14336 57344 4 0 -// 58 16384 16384 1 0 -// 59 18432 73728 4 0 -// 60 19072 57344 3 128 -// 61 20480 40960 2 0 -// 62 21760 65536 3 256 -// 63 24576 24576 1 0 -// 64 27264 81920 3 128 -// 65 28672 57344 2 0 -// 66 32768 32768 1 0 +// class bytes/obj bytes/span objects tail waste max waste +// 1 8 8192 1024 0 87.50% +// 2 16 8192 512 0 43.75% +// 3 32 8192 256 0 46.88% +// 4 48 8192 170 32 31.52% +// 5 64 8192 128 0 23.44% +// 6 80 8192 102 32 19.07% +// 7 96 8192 85 32 15.95% +// 8 112 8192 73 16 13.56% +// 9 128 8192 64 0 11.72% +// 10 144 8192 56 128 11.82% +// 11 160 8192 51 32 9.73% +// 12 176 8192 46 96 9.59% +// 13 192 8192 42 128 9.25% +// 14 208 8192 39 80 8.12% +// 15 224 8192 36 128 8.15% +// 16 240 8192 34 32 6.62% +// 17 256 8192 32 0 5.86% +// 18 288 8192 28 128 12.16% +// 19 320 8192 25 192 11.80% +// 20 352 8192 23 96 9.88% +// 21 384 8192 21 128 9.51% +// 22 416 8192 19 288 10.71% +// 23 448 8192 18 128 8.37% +// 24 480 8192 17 32 6.82% +// 25 512 8192 16 0 6.05% +// 26 576 8192 14 128 12.33% +// 27 640 8192 12 512 15.48% +// 28 704 8192 11 448 13.93% +// 29 768 8192 10 512 13.94% +// 30 896 8192 9 128 15.52% +// 31 1024 8192 8 0 12.40% +// 32 1152 8192 7 128 12.41% +// 33 1280 8192 6 512 15.55% +// 34 1408 16384 11 896 14.00% +// 35 1536 8192 5 512 14.00% +// 36 1792 16384 9 256 15.57% +// 37 2048 8192 4 0 12.45% +// 38 2304 16384 7 256 12.46% +// 39 2688 8192 3 128 15.59% +// 40 3072 24576 8 0 12.47% +// 41 3200 16384 5 384 6.22% +// 42 3456 24576 7 384 8.83% +// 43 4096 8192 2 0 15.60% +// 44 4864 24576 5 256 16.65% +// 45 5376 16384 3 256 10.92% +// 46 6144 24576 4 0 12.48% +// 47 6528 32768 5 128 6.23% +// 48 6784 40960 6 256 4.36% +// 49 6912 49152 7 768 3.37% +// 50 8192 8192 1 0 15.61% +// 51 9472 57344 6 512 14.28% +// 52 9728 49152 5 512 3.64% +// 53 10240 40960 4 0 4.99% +// 54 10880 32768 3 128 6.24% +// 55 12288 24576 2 0 11.45% +// 56 13568 40960 3 256 9.99% +// 57 14336 57344 4 0 5.35% +// 58 16384 16384 1 0 12.49% +// 59 18432 73728 4 0 11.11% 
+// 60 19072 57344 3 128 3.57% +// 61 20480 40960 2 0 6.87% +// 62 21760 65536 3 256 6.25% +// 63 24576 24576 1 0 11.45% +// 64 27264 81920 3 128 10.00% +// 65 28672 57344 2 0 4.91% +// 66 32768 32768 1 0 12.50% const ( _MaxSmallSize = 32768 diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go index f61f85e0fcb..ec5aa640222 100644 --- a/libgo/go/runtime/slice.go +++ b/libgo/go/runtime/slice.go @@ -23,6 +23,13 @@ type slice struct { cap int } +// An notInHeapSlice is a slice backed by go:notinheap memory. +type notInHeapSlice struct { + array *notInHeap + len int + cap int +} + // maxElems is a lookup table containing the maximum capacity for a slice. // The index is the size of the slice element. var maxElems = [...]uintptr{ @@ -85,7 +92,7 @@ func makeslice64(et *_type, len64, cap64 int64) slice { // The new slice's length is set to the requested capacity. func growslice(et *_type, old slice, cap int) slice { if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&et)) + callerpc := getcallerpc() racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice)) } if msanenabled { @@ -109,12 +116,20 @@ func growslice(et *_type, old slice, cap int) slice { if old.len < 1024 { newcap = doublecap } else { - for newcap < cap { + // Check 0 < newcap to detect overflow + // and prevent an infinite loop. + for 0 < newcap && newcap < cap { newcap += newcap / 4 } + // Set newcap to the requested cap when + // the newcap calculation overflowed. + if newcap <= 0 { + newcap = cap + } } } + var overflow bool var lenmem, newlenmem, capmem uintptr const ptrSize = unsafe.Sizeof((*byte)(nil)) switch et.size { @@ -122,20 +137,37 @@ func growslice(et *_type, old slice, cap int) slice { lenmem = uintptr(old.len) newlenmem = uintptr(cap) capmem = roundupsize(uintptr(newcap)) + overflow = uintptr(newcap) > _MaxMem newcap = int(capmem) case ptrSize: lenmem = uintptr(old.len) * ptrSize newlenmem = uintptr(cap) * ptrSize capmem = roundupsize(uintptr(newcap) * ptrSize) + overflow = uintptr(newcap) > _MaxMem/ptrSize newcap = int(capmem / ptrSize) default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size capmem = roundupsize(uintptr(newcap) * et.size) + overflow = uintptr(newcap) > maxSliceCap(et.size) newcap = int(capmem / et.size) } - if cap < old.cap || uintptr(newcap) > maxSliceCap(et.size) { + // The check of overflow (uintptr(newcap) > maxSliceCap(et.size)) + // in addition to capmem > _MaxMem is needed to prevent an overflow + // which can be used to trigger a segfault on 32bit architectures + // with this example program: + // + // type T [1<<27 + 1]int64 + // + // var d T + // var s []T + // + // func main() { + // s = append(s, d, d, d, d) + // print(len(s), "\n") + // } + if cap < old.cap || overflow || capmem > _MaxMem { panic(errorString("growslice: cap out of range")) } @@ -176,7 +208,7 @@ func slicecopy(to, fm slice, width uintptr) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicecopy) racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc) racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc) @@ -207,7 +239,7 @@ func slicestringcopy(to []byte, fm string) int { } if raceenabled { - callerpc := getcallerpc(unsafe.Pointer(&to)) + callerpc := getcallerpc() pc := funcPC(slicestringcopy) racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc) } diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go index 7436ddfdf4b..e8df9a6b7c4 100644 --- 
a/libgo/go/runtime/string.go +++ b/libgo/go/runtime/string.go @@ -99,7 +99,7 @@ func slicebytetostring(buf *tmpBuf, b []byte) (str string) { if raceenabled { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(l), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicebytetostring)) } if msanenabled { @@ -145,7 +145,7 @@ func slicebytetostringtmp(b []byte) string { if raceenabled && len(b) > 0 { racereadrangepc(unsafe.Pointer(&b[0]), uintptr(len(b)), - getcallerpc(unsafe.Pointer(&b)), + getcallerpc(), funcPC(slicebytetostringtmp)) } if msanenabled && len(b) > 0 { @@ -194,7 +194,7 @@ func slicerunetostring(buf *tmpBuf, a []rune) string { if raceenabled && len(a) > 0 { racereadrangepc(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0]), - getcallerpc(unsafe.Pointer(&buf)), + getcallerpc(), funcPC(slicerunetostring)) } if msanenabled && len(a) > 0 { diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index 84fa1c79689..c454356b838 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -107,16 +107,21 @@ func reflect_memmove(to, from unsafe.Pointer, n uintptr) { func memcmp(a, b unsafe.Pointer, size uintptr) int32 // exported value for testing -var hashLoad = loadFactor +var hashLoad = float32(loadFactorNum) / float32(loadFactorDen) //go:nosplit func fastrand() uint32 { mp := getg().m - fr := mp.fastrand - mx := uint32(int32(fr)>>31) & 0xa8888eef - fr = fr<<1 ^ mx - mp.fastrand = fr - return fr + // Implement xorshift64+: 2 32-bit xorshift sequences added together. + // Shift triplet [17,7,16] was calculated as indicated in Marsaglia's + // Xorshift paper: https://www.jstatsoft.org/article/view/v008i14/xorshift.pdf + // This generator passes the SmallCrush suite, part of TestU01 framework: + // http://simul.iro.umontreal.ca/testu01/tu01.html + s1, s0 := mp.fastrand[0], mp.fastrand[1] + s1 ^= s1 << 17 + s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16 + mp.fastrand[0], mp.fastrand[1] = s0, s1 + return s0 + s1 } //go:nosplit @@ -192,14 +197,16 @@ func publicationBarrier() // getcallerpc returns the program counter (PC) of its caller's caller. // getcallersp returns the stack pointer (SP) of its caller's caller. -// For both, the argp must be a pointer to the caller's first function argument. +// argp must be a pointer to the caller's first function argument. // The implementation may or may not use argp, depending on -// the architecture. +// the architecture. The implementation may be a compiler +// intrinsic; there is not necessarily code implementing this +// on every platform. // // For example: // // func f(arg1, arg2, arg3 int) { -// pc := getcallerpc(unsafe.Pointer(&arg1)) +// pc := getcallerpc() // sp := getcallersp(unsafe.Pointer(&arg1)) // } // @@ -219,7 +226,7 @@ func publicationBarrier() // immediately and can only be passed to nosplit functions. //go:noescape -func getcallerpc(argp unsafe.Pointer) uintptr +func getcallerpc() uintptr //go:noescape func getcallersp(argp unsafe.Pointer) uintptr @@ -430,7 +437,7 @@ func setpagesize(s uintptr) { } } -// Temporary for gccgo until we port mgc.go. +// Called by C code during library initialization. 
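
The fastrand rewrite in the stubs.go hunk above replaces the old one-word generator with an xorshift64+ variant split across two 32-bit state words. A standalone version of the same update step, runnable outside the runtime (the seed values are arbitrary nonzero constants, not what the runtime uses):

	package main

	import "fmt"

	// xorshift mirrors the update step of runtime.fastrand; the
	// state must never be all zeros or the sequence collapses.
	type xorshift struct{ s0, s1 uint32 }

	func (x *xorshift) next() uint32 {
		s1, s0 := x.s0, x.s1
		s1 ^= s1 << 17
		s1 = s1 ^ s0 ^ s1>>7 ^ s0>>16
		x.s0, x.s1 = s0, s1
		return s0 + s1
	}

	func main() {
		x := xorshift{s0: 0x9e3779b9, s1: 0x243f6a88} // arbitrary nonzero seed
		for i := 0; i < 4; i++ {
			fmt.Printf("%#08x\n", x.next())
		}
	}

Returning s0 + s1 rather than a raw state word is the "+" in xorshift+; the addition scrambles the low-order bits that plain xorshift generators emit poorly.
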
//go:linkname runtime_m0 runtime.runtime_m0 func runtime_m0() *m { return &m0 diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go index 490405d51fd..e7607722a64 100644 --- a/libgo/go/runtime/stubs2.go +++ b/libgo/go/runtime/stubs2.go @@ -23,3 +23,10 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 //go:noescape func open(name *byte, mode, perm int32) int32 + +// exitThread terminates the current thread, writing *wait = 0 when +// the stack is safe to reclaim. +func exitThread(wait *uint32) { + // This is never used by gccgo. + throw("exitThread") +} diff --git a/libgo/go/runtime/testdata/testprog/gc.go b/libgo/go/runtime/testdata/testprog/gc.go index 744b6108e2b..542451753b7 100644 --- a/libgo/go/runtime/testdata/testprog/gc.go +++ b/libgo/go/runtime/testdata/testprog/gc.go @@ -25,6 +25,7 @@ func GCSys() { runtime.GC() runtime.ReadMemStats(memstats) sys := memstats.Sys + fmt.Printf("original sys: %#x\n", sys) runtime.MemProfileRate = 0 // disable profiler @@ -36,6 +37,8 @@ func GCSys() { // Should only be using a few MB. // We allocated 100 MB or (if not short) 1 GB. runtime.ReadMemStats(memstats) + fmt.Printf("final sys: %#x\n", memstats.Sys) + fmt.Printf("%#v\n", *memstats) if sys > memstats.Sys { sys = 0 } else { diff --git a/libgo/go/runtime/testdata/testprog/gettid.go b/libgo/go/runtime/testdata/testprog/gettid.go new file mode 100644 index 00000000000..1b3e29ab08e --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/gettid.go @@ -0,0 +1,29 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux + +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "syscall" +) + +func gettid() int { + return syscall.Gettid() +} + +func tidExists(tid int) (exists, supported bool) { + stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) + if os.IsNotExist(err) { + return false, true + } + // Check if it's a zombie thread. + state := bytes.Fields(stat)[2] + return !(len(state) == 1 && state[0] == 'Z'), true +} diff --git a/libgo/go/runtime/testdata/testprog/gettid_none.go b/libgo/go/runtime/testdata/testprog/gettid_none.go new file mode 100644 index 00000000000..036db87e10e --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/gettid_none.go @@ -0,0 +1,15 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !linux + +package main + +func gettid() int { + return 0 +} + +func tidExists(tid int) (exists, supported bool) { + return false, false +} diff --git a/libgo/go/runtime/testdata/testprog/lockosthread.go b/libgo/go/runtime/testdata/testprog/lockosthread.go new file mode 100644 index 00000000000..88c0d12e4c1 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/lockosthread.go @@ -0,0 +1,94 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "os" + "runtime" + "time" +) + +var mainTID int + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainTID = gettid() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. 
+ runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // gettid only works on Linux, so on other platforms this just + // checks that the runtime doesn't do anything terrible. + + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. + runtime.LockOSThread() + if mainTID != 0 && gettid() != mainTID { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + if mainTID != 0 && gettid() == mainTID { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subTID int + ready := make(chan bool, 1) + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subTID = gettid() + ready <- true + // Exit with the thread locked. + }() + <-ready + runtime.UnlockOSThread() + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + if subTID != 0 && gettid() == subTID { + println("locked thread reused") + os.Exit(1) + } + exists, supported := tidExists(subTID) + if !supported || !exists { + goto ok + } + } + println("sub thread", subTID, "still running") + return +ok: + println("OK") +} diff --git a/libgo/go/runtime/testdata/testprog/syscall_windows.go b/libgo/go/runtime/testdata/testprog/syscall_windows.go index 6e6782e987a..b4b66441b83 100644 --- a/libgo/go/runtime/testdata/testprog/syscall_windows.go +++ b/libgo/go/runtime/testdata/testprog/syscall_windows.go @@ -4,11 +4,18 @@ package main -import "syscall" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) func init() { register("RaiseException", RaiseException) register("ZeroDivisionException", ZeroDivisionException) + register("StackMemory", StackMemory) } func RaiseException() { @@ -25,3 +32,39 @@ func ZeroDivisionException() { z := x / y println(z) } + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/callback.go b/libgo/go/runtime/testdata/testprogcgo/callback.go index a49fc19b284..2f7568c2c4e 100644 --- a/libgo/go/runtime/testdata/testprogcgo/callback.go +++ b/libgo/go/runtime/testdata/testprogcgo/callback.go @@ -34,6 +34,7 @@ import "C" import ( "fmt" + "os" "runtime" ) @@ -68,7 +69,10 @@ func grow1(x, sum *int) int { } func CgoCallbackGC() { - const P = 100 + P := 100 + 
if os.Getenv("RUNTIME_TESTING_SHORT") != "" { + P = 10 + } done := make(chan bool) // allocate a bunch of stack frames and spray them with pointers for i := 0; i < P; i++ { diff --git a/libgo/go/runtime/testdata/testprogcgo/catchpanic.go b/libgo/go/runtime/testdata/testprogcgo/catchpanic.go new file mode 100644 index 00000000000..55a606d1bc8 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/catchpanic.go @@ -0,0 +1,46 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +/* +#include <signal.h> +#include <stdlib.h> +#include <string.h> + +static void abrthandler(int signum) { + if (signum == SIGABRT) { + exit(0); // success + } +} + +void registerAbortHandler() { + struct sigaction act; + memset(&act, 0, sizeof act); + act.sa_handler = abrthandler; + sigaction(SIGABRT, &act, NULL); +} + +static void __attribute__ ((constructor)) sigsetup(void) { + if (getenv("CGOCATCHPANIC_EARLY_HANDLER") == NULL) + return; + registerAbortHandler(); +} +*/ +import "C" +import "os" + +func init() { + register("CgoCatchPanic", CgoCatchPanic) +} + +// Test that the SIGABRT raised by panic can be caught by an early signal handler. +func CgoCatchPanic() { + if _, ok := os.LookupEnv("CGOCATCHPANIC_EARLY_HANDLER"); !ok { + C.registerAbortHandler() + } + panic("catch me") +} diff --git a/libgo/go/runtime/testdata/testprogcgo/cgo.go b/libgo/go/runtime/testdata/testprogcgo/cgo.go index 209524a24db..a587db385b3 100644 --- a/libgo/go/runtime/testdata/testprogcgo/cgo.go +++ b/libgo/go/runtime/testdata/testprogcgo/cgo.go @@ -52,7 +52,11 @@ func CgoSignalDeadlock() { time.Sleep(time.Millisecond) start := time.Now() var times []time.Duration - for i := 0; i < 64; i++ { + n := 64 + if os.Getenv("RUNTIME_TEST_SHORT") != "" { + n = 16 + } + for i := 0; i < n; i++ { go func() { runtime.LockOSThread() select {} diff --git a/libgo/go/runtime/testdata/testprogcgo/lockosthread.c b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c new file mode 100644 index 00000000000..b10cc4f3b92 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c @@ -0,0 +1,13 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +#include <stdint.h> + +uint32_t threadExited; + +void setExited(void *x) { + __sync_fetch_and_add(&threadExited, 1); +} diff --git a/libgo/go/runtime/testdata/testprogcgo/lockosthread.go b/libgo/go/runtime/testdata/testprogcgo/lockosthread.go new file mode 100644 index 00000000000..36423d9eb0c --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/lockosthread.go @@ -0,0 +1,111 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +import ( + "os" + "runtime" + "sync/atomic" + "time" + "unsafe" +) + +/* +#include <pthread.h> +#include <stdint.h> + +extern uint32_t threadExited; + +void setExited(void *x); +*/ +import "C" + +var mainThread C.pthread_t + +func init() { + registerInit("LockOSThreadMain", func() { + // init is guaranteed to run on the main thread. + mainThread = C.pthread_self() + }) + register("LockOSThreadMain", LockOSThreadMain) + + registerInit("LockOSThreadAlt", func() { + // Lock the OS thread now so main runs on the main thread. 
+ runtime.LockOSThread() + }) + register("LockOSThreadAlt", LockOSThreadAlt) +} + +func LockOSThreadMain() { + // This requires GOMAXPROCS=1 from the beginning to reliably + // start a goroutine on the main thread. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // Because GOMAXPROCS=1, this *should* be on the main + // thread. Stay there. + runtime.LockOSThread() + self := C.pthread_self() + if C.pthread_equal(mainThread, self) == 0 { + println("failed to start goroutine on main thread") + os.Exit(1) + } + // Exit with the thread locked, which should exit the + // main thread. + ready <- true + }() + <-ready + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is still running on a different + // thread. + self := C.pthread_self() + if C.pthread_equal(mainThread, self) != 0 { + println("goroutine migrated to locked thread") + os.Exit(1) + } + println("OK") +} + +func LockOSThreadAlt() { + // This is running locked to the main OS thread. + + var subThread C.pthread_t + ready := make(chan bool, 1) + C.threadExited = 0 + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + subThread = C.pthread_self() + // Register a pthread destructor so we can tell this + // thread has exited. + var key C.pthread_key_t + C.pthread_key_create(&key, (*[0]byte)(unsafe.Pointer(C.setExited))) + C.pthread_setspecific(key, unsafe.Pointer(new(int))) + ready <- true + // Exit with the thread locked. + }() + <-ready + for i := 0; i < 100; i++ { + time.Sleep(1 * time.Millisecond) + // Check that this goroutine is running on a different thread. + self := C.pthread_self() + if C.pthread_equal(subThread, self) != 0 { + println("locked thread reused") + os.Exit(1) + } + if atomic.LoadUint32((*uint32)(&C.threadExited)) != 0 { + println("OK") + return + } + } + println("sub thread still running") + os.Exit(1) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/sigstack.go b/libgo/go/runtime/testdata/testprogcgo/sigstack.go new file mode 100644 index 00000000000..e30a5592dcb --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/sigstack.go @@ -0,0 +1,95 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +// Test handling of Go-allocated signal stacks when calling from +// C-created threads with and without signal stacks. (See issue +// #22930.) + +package main + +/* +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> + +#ifndef MAP_STACK +#define MAP_STACK 0 +#endif + +extern void SigStackCallback(); + +static void* WithSigStack(void* arg __attribute__((unused))) { + // Set up an alternate system stack. + void* base = mmap(0, SIGSTKSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_STACK, -1, 0); + if (base == MAP_FAILED) { + perror("mmap failed"); + abort(); + } + stack_t st = {}, ost = {}; + st.ss_sp = (char*)base; + st.ss_flags = 0; + st.ss_size = SIGSTKSZ; + if (sigaltstack(&st, &ost) < 0) { + perror("sigaltstack failed"); + abort(); + } + + // Call Go. + SigStackCallback(); + + // Disable signal stack and protect it so we can detect reuse. + if (ost.ss_flags & SS_DISABLE) { + // Darwin libsystem has a bug where it checks ss_size + // even if SS_DISABLE is set. (The kernel gets it right.) 
+ ost.ss_size = SIGSTKSZ; + } + if (sigaltstack(&ost, NULL) < 0) { + perror("sigaltstack restore failed"); + abort(); + } + mprotect(base, SIGSTKSZ, PROT_NONE); + return NULL; +} + +static void* WithoutSigStack(void* arg __attribute__((unused))) { + SigStackCallback(); + return NULL; +} + +static void DoThread(int sigstack) { + pthread_t tid; + if (sigstack) { + pthread_create(&tid, NULL, WithSigStack, NULL); + } else { + pthread_create(&tid, NULL, WithoutSigStack, NULL); + } + pthread_join(tid, NULL); +} +*/ +import "C" + +func init() { + register("SigStack", SigStack) +} + +func SigStack() { + C.DoThread(0) + C.DoThread(1) + C.DoThread(0) + C.DoThread(1) + println("OK") +} + +var BadPtr *int + +//export SigStackCallback +func SigStackCallback() { + // Cause the Go signal handler to run. + defer func() { recover() }() + *BadPtr = 42 +} diff --git a/libgo/go/runtime/testdata/testprogcgo/stack_windows.go b/libgo/go/runtime/testdata/testprogcgo/stack_windows.go new file mode 100644 index 00000000000..846297a960c --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/stack_windows.go @@ -0,0 +1,54 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import "C" +import ( + "internal/syscall/windows" + "runtime" + "sync" + "syscall" + "unsafe" +) + +func init() { + register("StackMemory", StackMemory) +} + +func getPagefileUsage() (uintptr, error) { + p, err := syscall.GetCurrentProcess() + if err != nil { + return 0, err + } + var m windows.PROCESS_MEMORY_COUNTERS + err = windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))) + if err != nil { + return 0, err + } + return m.PagefileUsage, nil +} + +func StackMemory() { + mem1, err := getPagefileUsage() + if err != nil { + panic(err) + } + const threadCount = 100 + var wg sync.WaitGroup + for i := 0; i < threadCount; i++ { + wg.Add(1) + go func() { + runtime.LockOSThread() + wg.Done() + select {} + }() + } + wg.Wait() + mem2, err := getPagefileUsage() + if err != nil { + panic(err) + } + print((mem2 - mem1) / threadCount) +} diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index f204830a6f7..93181fde600 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -6,14 +6,18 @@ package runtime -import "unsafe" +import ( + "runtime/internal/sys" + "unsafe" +) // Package time knows the layout of this structure. // If this struct changes, adjust ../time/sleep.go:/runtimeTimer. // For GOOS=nacl, package syscall knows the layout of this structure. // If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer. type timer struct { - i int // heap index + tb *timersBucket // the bucket the timer lives in + i int // heap index // Timer wakes up at when, and then at when+period, ... (period > 0 only) // each time calling f(arg, now) in the timer goroutine, so f must be @@ -25,7 +29,37 @@ type timer struct { seq uintptr } -var timers struct { +// timersLen is the length of timers array. +// +// Ideally, this would be set to GOMAXPROCS, but that would require +// dynamic reallocation +// +// The current value is a compromise between memory usage and performance +// that should cover the majority of GOMAXPROCS values used in the wild. +const timersLen = 64 + +// timers contains "per-P" timer heaps. +// +// Timers are queued into timersBucket associated with the current P, +// so each P may work with its own timers independently of other P instances. 
+// +// Each timersBucket may be associated with multiple P +// if GOMAXPROCS > timersLen. +var timers [timersLen]struct { + timersBucket + + // The padding should eliminate false sharing + // between timersBucket values. + pad [sys.CacheLineSize - unsafe.Sizeof(timersBucket{})%sys.CacheLineSize]byte +} + +func (t *timer) assignBucket() *timersBucket { + id := uint8(getg().m.p.ptr().id) % timersLen + t.tb = &timers[id].timersBucket + return t.tb +} + +type timersBucket struct { lock mutex gp *g created bool @@ -51,18 +85,20 @@ func timeSleep(ns int64) { return } - t := getg().timer + gp := getg() + t := gp.timer if t == nil { t = new(timer) - getg().timer = t + gp.timer = t } *t = timer{} t.when = nanotime() + ns t.f = goroutineReady - t.arg = getg() - lock(&timers.lock) - addtimerLocked(t) - goparkunlock(&timers.lock, "sleep", traceEvGoSleep, 2) + t.arg = gp + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + goparkunlock(&tb.lock, "sleep", traceEvGoSleep, 2) } // startTimer adds t to the timer heap. @@ -89,90 +125,98 @@ func goroutineReady(arg interface{}, seq uintptr) { } func addtimer(t *timer) { - lock(&timers.lock) - addtimerLocked(t) - unlock(&timers.lock) + tb := t.assignBucket() + lock(&tb.lock) + tb.addtimerLocked(t) + unlock(&tb.lock) } // Add a timer to the heap and start or kick timerproc if the new timer is // earlier than any of the others. // Timers are locked. -func addtimerLocked(t *timer) { +func (tb *timersBucket) addtimerLocked(t *timer) { // when must never be negative; otherwise timerproc will overflow // during its delta calculation and never expire other runtime timers. if t.when < 0 { t.when = 1<<63 - 1 } - t.i = len(timers.t) - timers.t = append(timers.t, t) - siftupTimer(t.i) + t.i = len(tb.t) + tb.t = append(tb.t, t) + siftupTimer(tb.t, t.i) if t.i == 0 { // siftup moved to top: new earliest deadline. - if timers.sleeping { - timers.sleeping = false - notewakeup(&timers.waitnote) + if tb.sleeping { + tb.sleeping = false + notewakeup(&tb.waitnote) } - if timers.rescheduling { - timers.rescheduling = false - goready(timers.gp, 0) + if tb.rescheduling { + tb.rescheduling = false + goready(tb.gp, 0) } } - if !timers.created { - timers.created = true + if !tb.created { + tb.created = true expectSystemGoroutine() - go timerproc() + go timerproc(tb) } } // Delete timer t from the heap. // Do not need to update the timerproc: if it wakes up early, no big deal. func deltimer(t *timer) bool { - // Dereference t so that any panic happens before the lock is held. - // Discard result, because t might be moving in the heap. - _ = t.i + if t.tb == nil { + // t.tb can be nil if the user created a timer + // directly, without invoking startTimer e.g + // time.Ticker{C: c} + // In this case, return early without any deletion. + // See Issue 21874. + return false + } - lock(&timers.lock) + tb := t.tb + + lock(&tb.lock) // t may not be registered anymore and may have // a bogus i (typically 0, if generated by Go). // Verify it before proceeding. 
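
A note on the bucket scheme introduced above: rather than one global timer heap guarded by one mutex, there are now timersLen (64) independent heaps, each padded out to a cache line to avoid false sharing, and each P indexes into them by its id. A sketch of just the selection arithmetic (illustrative; the real code reads the id from getg().m.p):

	package main

	import "fmt"

	const timersLen = 64

	// bucketFor maps a P id to a bucket. The uint8 truncation mirrors
	// the runtime's cast; when GOMAXPROCS > timersLen several Ps
	// simply share a bucket, which is correct but less parallel.
	func bucketFor(pid int32) int {
		return int(uint8(pid) % timersLen)
	}

	func main() {
		for _, pid := range []int32{0, 1, 63, 64, 130} {
			fmt.Printf("P %3d -> bucket %2d\n", pid, bucketFor(pid))
		}
	}

The deltimer body continues below with the index check that the comment above describes.
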
i := t.i - last := len(timers.t) - 1 - if i < 0 || i > last || timers.t[i] != t { - unlock(&timers.lock) + last := len(tb.t) - 1 + if i < 0 || i > last || tb.t[i] != t { + unlock(&tb.lock) return false } if i != last { - timers.t[i] = timers.t[last] - timers.t[i].i = i + tb.t[i] = tb.t[last] + tb.t[i].i = i } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if i != last { - siftupTimer(i) - siftdownTimer(i) + siftupTimer(tb.t, i) + siftdownTimer(tb.t, i) } - unlock(&timers.lock) + unlock(&tb.lock) return true } // Timerproc runs the time-driven events. -// It sleeps until the next event in the timers heap. +// It sleeps until the next event in the tb heap. // If addtimer inserts a new earlier event, it wakes timerproc early. -func timerproc() { +func timerproc(tb *timersBucket) { setSystemGoroutine() - timers.gp = getg() + tb.gp = getg() for { - lock(&timers.lock) - timers.sleeping = false + lock(&tb.lock) + tb.sleeping = false now := nanotime() delta := int64(-1) for { - if len(timers.t) == 0 { + if len(tb.t) == 0 { delta = -1 break } - t := timers.t[0] + t := tb.t[0] delta = t.when - now if delta > 0 { break @@ -180,43 +224,43 @@ func timerproc() { if t.period > 0 { // leave in heap but adjust next time to fire t.when += t.period * (1 + -delta/t.period) - siftdownTimer(0) + siftdownTimer(tb.t, 0) } else { // remove from heap - last := len(timers.t) - 1 + last := len(tb.t) - 1 if last > 0 { - timers.t[0] = timers.t[last] - timers.t[0].i = 0 + tb.t[0] = tb.t[last] + tb.t[0].i = 0 } - timers.t[last] = nil - timers.t = timers.t[:last] + tb.t[last] = nil + tb.t = tb.t[:last] if last > 0 { - siftdownTimer(0) + siftdownTimer(tb.t, 0) } t.i = -1 // mark as removed } f := t.f arg := t.arg seq := t.seq - unlock(&timers.lock) + unlock(&tb.lock) if raceenabled { raceacquire(unsafe.Pointer(t)) } f(arg, seq) - lock(&timers.lock) + lock(&tb.lock) } if delta < 0 || faketime > 0 { // No timers left - put goroutine to sleep. - timers.rescheduling = true - goparkunlock(&timers.lock, "timer goroutine (idle)", traceEvGoBlock, 1) + tb.rescheduling = true + goparkunlock(&tb.lock, "timer goroutine (idle)", traceEvGoBlock, 1) continue } // At least one timer pending. Sleep until then. - timers.sleeping = true - timers.sleepUntil = now + delta - noteclear(&timers.waitnote) - unlock(&timers.lock) - notetsleepg(&timers.waitnote, delta) + tb.sleeping = true + tb.sleepUntil = now + delta + noteclear(&tb.waitnote) + unlock(&tb.lock) + notetsleepg(&tb.waitnote, delta) } } @@ -225,28 +269,67 @@ func timejump() *g { return nil } - lock(&timers.lock) - if !timers.created || len(timers.t) == 0 { - unlock(&timers.lock) + for i := range timers { + lock(&timers[i].lock) + } + gp := timejumpLocked() + for i := range timers { + unlock(&timers[i].lock) + } + + return gp +} + +func timejumpLocked() *g { + // Determine a timer bucket with minimum when. + var minT *timer + for i := range timers { + tb := &timers[i] + if !tb.created || len(tb.t) == 0 { + continue + } + t := tb.t[0] + if minT == nil || t.when < minT.when { + minT = t + } + } + if minT == nil || minT.when <= faketime { + return nil + } + + faketime = minT.when + tb := minT.tb + if !tb.rescheduling { return nil } + tb.rescheduling = false + return tb.gp +} + +func timeSleepUntil() int64 { + next := int64(1<<63 - 1) - var gp *g - if faketime < timers.t[0].when { - faketime = timers.t[0].when - if timers.rescheduling { - timers.rescheduling = false - gp = timers.gp + // Determine minimum sleepUntil across all the timer buckets. 
+ // + // The function can not return a precise answer, + // as another timer may pop in as soon as timers have been unlocked. + // So lock the timers one by one instead of all at once. + for i := range timers { + tb := &timers[i] + + lock(&tb.lock) + if tb.sleeping && tb.sleepUntil < next { + next = tb.sleepUntil } + unlock(&tb.lock) } - unlock(&timers.lock) - return gp + + return next } // Heap maintenance algorithms. -func siftupTimer(i int) { - t := timers.t +func siftupTimer(t []*timer, i int) { when := t[i].when tmp := t[i] for i > 0 { @@ -256,14 +339,15 @@ func siftupTimer(i int) { } t[i] = t[p] t[i].i = i - t[p] = tmp - t[p].i = p i = p } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } -func siftdownTimer(i int) { - t := timers.t +func siftdownTimer(t []*timer, i int) { n := len(t) when := t[i].when tmp := t[i] @@ -294,10 +378,12 @@ func siftdownTimer(i int) { } t[i] = t[c] t[i].i = i - t[c] = tmp - t[c].i = c i = c } + if tmp != t[i] { + t[i] = tmp + t[i].i = i + } } // Entry points for net, time to call nanotime. @@ -312,4 +398,10 @@ func time_runtimeNano() int64 { return nanotime() } -var startNano int64 = nanotime() +// Monotonic times are reported as offsets from startNano. +// We initialize startNano to nanotime() - 1 so that on systems where +// monotonic time resolution is fairly low (e.g. Windows 2008 +// which appears to have a default resolution of 15ms), +// we avoid ever reporting a nanotime of 0. +// (Callers may want to use 0 as "time not set".) +var startNano int64 = nanotime() - 1 diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go index af9313be37a..8427e76c5a3 100644 --- a/libgo/go/runtime/trace.go +++ b/libgo/go/runtime/trace.go @@ -28,8 +28,8 @@ const ( traceEvProcStop = 6 // stop of P [timestamp] traceEvGCStart = 7 // GC start [timestamp, seq, stack id] traceEvGCDone = 8 // GC done [timestamp] - traceEvGCScanStart = 9 // GC mark termination start [timestamp] - traceEvGCScanDone = 10 // GC mark termination done [timestamp] + traceEvGCSTWStart = 9 // GC STW start [timestamp, kind] + traceEvGCSTWDone = 10 // GC STW done [timestamp] traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed] traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] @@ -235,21 +235,21 @@ func StartTrace() error { trace.timeStart = nanotime() trace.headerWritten = false trace.footerWritten = false - trace.strings = make(map[string]uint64) + + // string to id mapping + // 0 : reserved for an empty string + // remaining: other strings registered by traceString trace.stringSeq = 0 + trace.strings = make(map[string]uint64) + trace.seqGC = 0 _g_.m.startingtrace = false trace.enabled = true // Register runtime goroutine labels. _, pid, bufp := traceAcquireBuffer() - buf := (*bufp).ptr() - if buf == nil { - buf = traceFlush(0).ptr() - (*bufp).set(buf) - } for i, label := range gcMarkWorkerModeStrings[:] { - trace.markWorkerLabels[i], buf = traceString(buf, label) + trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label) } traceReleaseBuffer(pid) @@ -277,10 +277,9 @@ func StopTrace() { traceGoSched() - for _, p := range &allp { - if p == nil { - break - } + // Loop over all allocated Ps because dead Ps may still have + // trace buffers. 
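
allp is now a slice whose length tracks GOMAXPROCS, so when GOMAXPROCS shrinks, Ps past len(allp) are dead but their trace buffers still sit in the backing array; reslicing to cap(allp), as the loop below does, reaches them. The idiom in isolation (hypothetical element type):

	package main

	import "fmt"

	func main() {
		backing := []string{"p0", "p1", "p2", "p3"}
		live := backing[:2] // as after shrinking GOMAXPROCS to 2

		// Reslicing to full capacity visits the dead tail as well.
		for _, p := range live[:cap(live)] {
			fmt.Println(p)
		}
	}
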
+ for _, p := range allp[:cap(allp)] { buf := p.tracebuf if buf != 0 { traceFullQueue(buf) @@ -320,10 +319,7 @@ func StopTrace() { // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world. lock(&trace.lock) - for _, p := range &allp { - if p == nil { - break - } + for _, p := range allp[:cap(allp)] { if p.tracebuf != 0 { throw("trace: non-empty trace buffer in proc") } @@ -382,7 +378,7 @@ func ReadTrace() []byte { trace.headerWritten = true trace.lockOwner = nil unlock(&trace.lock) - return []byte("go 1.9 trace\x00\x00\x00\x00") + return []byte("go 1.10 trace\x00\x00\x00") } // Wait for new data. if trace.fullHead == 0 && !trace.shutdown { @@ -408,9 +404,12 @@ func ReadTrace() []byte { var data []byte data = append(data, traceEvFrequency|0<<traceArgCountShift) data = traceAppend(data, uint64(freq)) - if timers.gp != nil { - data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift) - data = traceAppend(data, uint64(timers.gp.goid)) + for i := range timers { + tb := &timers[i] + if tb.gp != nil { + data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift) + data = traceAppend(data, uint64(tb.gp.goid)) + } } // This will emit a bunch of full buffers, we will pick them up // on the next iteration. @@ -514,18 +513,12 @@ func traceEvent(ev byte, skip int, args ...uint64) { buf := (*bufp).ptr() const maxSize = 2 + 5*traceBytesPerNumber // event type, length, sequence, timestamp, stack id and two add params if buf == nil || len(buf.arr)-buf.pos < maxSize { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() (*bufp).set(buf) } ticks := uint64(cputicks()) / traceTickDiv tickDiff := ticks - buf.lastTicks - if buf.pos == 0 { - buf.byte(traceEvBatch | 1<<traceArgCountShift) - buf.varint(uint64(pid)) - buf.varint(ticks) - tickDiff = 0 - } buf.lastTicks = ticks narg := byte(len(args)) if skip >= 0 { @@ -602,7 +595,7 @@ func traceReleaseBuffer(pid int32) { } // traceFlush puts buf onto stack of full buffers and returns an empty buffer. -func traceFlush(buf traceBufPtr) traceBufPtr { +func traceFlush(buf traceBufPtr, pid int32) traceBufPtr { owner := trace.lockOwner dolock := owner == nil || owner != getg().m.curg if dolock { @@ -623,34 +616,51 @@ func traceFlush(buf traceBufPtr) traceBufPtr { bufp := buf.ptr() bufp.link.set(nil) bufp.pos = 0 - bufp.lastTicks = 0 + + // initialize the buffer for a new batch + ticks := uint64(cputicks()) / traceTickDiv + bufp.lastTicks = ticks + bufp.byte(traceEvBatch | 1<<traceArgCountShift) + bufp.varint(uint64(pid)) + bufp.varint(ticks) + if dolock { unlock(&trace.lock) } return buf } -func traceString(buf *traceBuf, s string) (uint64, *traceBuf) { +// traceString adds a string to the trace.strings and returns the id. +func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) { if s == "" { - return 0, buf + return 0, bufp } if id, ok := trace.strings[s]; ok { - return id, buf + return id, bufp } trace.stringSeq++ id := trace.stringSeq trace.strings[s] = id + // memory allocation in above may trigger tracing and + // cause *bufp changes. Following code now works with *bufp, + // so there must be no memory allocation or any activities + // that causes tracing after this point. 
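
The comment above describes a reentrancy hazard: inserting into trace.strings may allocate, allocation may emit trace events, and emitting an event may swap the current buffer, which is why traceString now takes a *traceBufPtr and dereferences it only after the insert. The same discipline in miniature (illustrative names, not the runtime's types):

	package main

	import "fmt"

	type buffer struct{ data []byte }

	var current = &buffer{} // may be swapped by reentrant work

	// sideEffects stands in for the map insert: it can replace the
	// active buffer before control returns to the caller.
	func sideEffects() {
		current = &buffer{}
	}

	func record(bufp **buffer, msg string) {
		sideEffects()
		buf := *bufp // dereference only after the last reentrant call
		buf.data = append(buf.data, msg...)
	}

	func main() {
		record(&current, "hello")
		fmt.Println(string(current.data))
	}
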
+ + buf := (*bufp).ptr() size := 1 + 2*traceBytesPerNumber + len(s) - if len(buf.arr)-buf.pos < size { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + if buf == nil || len(buf.arr)-buf.pos < size { + buf = traceFlush(traceBufPtrOf(buf), pid).ptr() + (*bufp).set(buf) } buf.byte(traceEvString) buf.varint(id) buf.varint(uint64(len(s))) buf.pos += copy(buf.arr[buf.pos:], s) - return id, buf + + (*bufp).set(buf) + return id, bufp } // traceAppend appends v to buf in little-endian-base-128 encoding. @@ -772,7 +782,7 @@ func (tab *traceStackTable) newStack(n int) *traceStack { // releases all memory and resets state. func (tab *traceStackTable) dump() { var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte - buf := traceFlush(0).ptr() + bufp := traceFlush(0, 0) for _, stk := range tab.tab { stk := stk.ptr() for ; stk != nil; stk = stk.link.ptr() { @@ -782,7 +792,7 @@ func (tab *traceStackTable) dump() { tmpbuf = traceAppend(tmpbuf, uint64(len(frames))) for _, f := range frames { var frame traceFrame - frame, buf = traceFrameForPC(buf, f) + frame, bufp = traceFrameForPC(bufp, 0, f) tmpbuf = traceAppend(tmpbuf, uint64(f.pc)) tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID)) tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID)) @@ -790,9 +800,10 @@ func (tab *traceStackTable) dump() { } // Now copy to the buffer. size := 1 + traceBytesPerNumber + len(tmpbuf) - if len(buf.arr)-buf.pos < size { - buf = traceFlush(traceBufPtrOf(buf)).ptr() + if buf := bufp.ptr(); len(buf.arr)-buf.pos < size { + bufp = traceFlush(bufp, 0) } + buf := bufp.ptr() buf.byte(traceEvStack | 3<<traceArgCountShift) buf.varint(uint64(len(tmpbuf))) buf.pos += copy(buf.arr[buf.pos:], tmpbuf) @@ -800,7 +811,7 @@ func (tab *traceStackTable) dump() { } lock(&trace.lock) - traceFullQueue(traceBufPtrOf(buf)) + traceFullQueue(bufp) unlock(&trace.lock) tab.mem.drop() @@ -813,7 +824,10 @@ type traceFrame struct { line uint64 } -func traceFrameForPC(buf *traceBuf, f location) (traceFrame, *traceBuf) { +// traceFrameForPC records the frame information. +// It may allocate memory. +func traceFrameForPC(buf traceBufPtr, pid int32, f location) (traceFrame, traceBufPtr) { + bufp := &buf var frame traceFrame fn := f.function @@ -821,14 +835,14 @@ func traceFrameForPC(buf *traceBuf, f location) (traceFrame, *traceBuf) { if len(fn) > maxLen { fn = fn[len(fn)-maxLen:] } - frame.funcID, buf = traceString(buf, fn) + frame.funcID, bufp = traceString(bufp, pid, fn) frame.line = uint64(f.lineno) file := f.filename if len(file) > maxLen { file = file[len(file)-maxLen:] } - frame.fileID, buf = traceString(buf, file) - return frame, buf + frame.fileID, bufp = traceString(bufp, pid, file) + return frame, (*bufp) } // traceAlloc is a non-thread-safe region allocator. @@ -917,12 +931,12 @@ func traceGCDone() { traceEvent(traceEvGCDone, -1) } -func traceGCScanStart() { - traceEvent(traceEvGCScanStart, -1) +func traceGCSTWStart(kind int) { + traceEvent(traceEvGCSTWStart, -1, uint64(kind)) } -func traceGCScanDone() { - traceEvent(traceEvGCScanDone, -1) +func traceGCSTWDone() { + traceEvent(traceEvGCSTWDone, -1) } // traceGCSweepStart prepares to trace a sweep loop. This does not diff --git a/libgo/go/runtime/trace/example_test.go b/libgo/go/runtime/trace/example_test.go new file mode 100644 index 00000000000..8e0ee5a1a3f --- /dev/null +++ b/libgo/go/runtime/trace/example_test.go @@ -0,0 +1,41 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build ignore + +package trace_test + +import ( + "fmt" + "log" + "os" + "runtime/trace" +) + +// Example demonstrates the use of the trace package to trace +// the execution of a Go program. The trace output will be +// written to the file trace.out +func Example() { + f, err := os.Create("trace.out") + if err != nil { + log.Fatalf("failed to create trace output file: %v", err) + } + defer func() { + if err := f.Close(); err != nil { + log.Fatalf("failed to close trace file: %v", err) + } + }() + + if err := trace.Start(f); err != nil { + log.Fatalf("failed to start trace: %v", err) + } + defer trace.Stop() + + // your program here + RunMyProgram() +} + +func RunMyProgram() { + fmt.Printf("this function will be traced") +} diff --git a/libgo/go/runtime/trace/trace.go b/libgo/go/runtime/trace/trace.go index 7cbb8a6e82c..439f998c03a 100644 --- a/libgo/go/runtime/trace/trace.go +++ b/libgo/go/runtime/trace/trace.go @@ -2,13 +2,36 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Go execution tracer. -// The tracer captures a wide range of execution events like goroutine -// creation/blocking/unblocking, syscall enter/exit/block, GC-related events, -// changes of heap size, processor start/stop, etc and writes them to an io.Writer -// in a compact form. A precise nanosecond-precision timestamp and a stack -// trace is captured for most events. A trace can be analyzed later with -// 'go tool trace' command. +// Package trace contains facilities for programs to generate trace +// for Go execution tracer. +// +// The execution trace captures a wide range of execution events such as +// goroutine creation/blocking/unblocking, syscall enter/exit/block, +// GC-related events, changes of heap size, processor start/stop, etc. +// A precise nanosecond-precision timestamp and a stack trace is +// captured for most events. The generated trace can be interpreted +// using `go tool trace`. +// +// Tracing a Go program +// +// Support for tracing tests and benchmarks built with the standard +// testing package is built into `go test`. For example, the following +// command runs the test in the current directory and writes the trace +// file (trace.out). +// +// go test -trace=test.out +// +// This runtime/trace package provides APIs to add equivalent tracing +// support to a standalone program. See the Example that demonstrates +// how to use this API to enable tracing. +// +// There is also a standard HTTP interface to profiling data. Adding the +// following line will install handlers under the /debug/pprof/trace URL +// to download live profiles: +// +// import _ "net/http/pprof" +// +// See the net/http/pprof package for more details. package trace import ( diff --git a/libgo/go/runtime/trace/trace_test.go b/libgo/go/runtime/trace/trace_test.go index c5f64fcf4cf..5fa5b82f8e2 100644 --- a/libgo/go/runtime/trace/trace_test.go +++ b/libgo/go/runtime/trace/trace_test.go @@ -7,6 +7,7 @@ package trace_test import ( "bytes" "flag" + "internal/race" "internal/trace" "io" "io/ioutil" @@ -14,6 +15,7 @@ import ( "os" "runtime" . "runtime/trace" + "strconv" "sync" "testing" "time" @@ -23,6 +25,61 @@ var ( saveTraces = flag.Bool("savetraces", false, "save traces collected by tests") ) +// TestEventBatch tests Flush calls that happen during Start +// don't produce corrupted traces. 
+func TestEventBatch(t *testing.T) {
+ if race.Enabled {
+ t.Skip("skipping in race mode")
+ }
+ if testing.Short() {
+ t.Skip("skipping in short mode")
+ }
+ // During Start, a bunch of records are written to reflect the current
+ // snapshot of the program, including the state of each goroutine.
+ // Some string constants are also written to the trace to aid trace
+ // parsing. This test checks that a Flush of the buffer occurring
+ // during this process doesn't produce a corrupted trace.
+ // Exactly when a Flush happens during Start is hard to control,
+ // so we test with a range of goroutine counts, hoping that one of
+ // them triggers a Flush.
+ // This range was chosen to fill up a ~64KB buffer with traceEvGoCreate
+ // and traceEvGoWaiting events (12-13 bytes per goroutine).
+ for g := 4950; g < 5050; g++ {
+ n := g
+ t.Run("G="+strconv.Itoa(n), func(t *testing.T) {
+ var wg sync.WaitGroup
+ wg.Add(n)
+
+ in := make(chan bool, 1000)
+ for i := 0; i < n; i++ {
+ go func() {
+ <-in
+ wg.Done()
+ }()
+ }
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ for i := 0; i < n; i++ {
+ in <- true
+ }
+ wg.Wait()
+ Stop()
+
+ _, err := trace.Parse(buf, "")
+ if err == trace.ErrTimeOrder {
+ t.Skipf("skipping trace: %v", err)
+ }
+
+ if err != nil {
+ t.Fatalf("failed to parse trace: %v", err)
+ }
+ })
+ }
+}
+
 func TestTraceStartStop(t *testing.T) {
 buf := new(bytes.Buffer)
 if err := Start(buf); err != nil {
diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go
index 37c569887b0..79f78d8d247 100644
--- a/libgo/go/runtime/traceback_gccgo.go
+++ b/libgo/go/runtime/traceback_gccgo.go
@@ -52,7 +52,7 @@ func c_callers(skip int32, locbuf *location, max int32, keepThunks bool) int32
 // callers returns a stack trace of the current goroutine.
 // The gc version of callers takes []uintptr, but we take []location.
 func callers(skip int, locbuf []location) int {
- n := c_callers(int32(skip), &locbuf[0], int32(len(locbuf)), false)
+ n := c_callers(int32(skip)+1, &locbuf[0], int32(len(locbuf)), false)
 return int(n)
 }

@@ -156,7 +156,7 @@ func goroutineheader(gp *g) {
 if waitfor >= 1 {
 print(", ", waitfor, " minutes")
 }
- if gp.lockedm != nil {
+ if gp.lockedm != 0 {
 print(", locked to thread")
 }
 print("]:\n")
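
The traceback_gccgo.go change above adds 1 to skip so that callers does not report its own frame, matching the gc runtime's convention for runtime.Callers, where skip 0 identifies the Callers frame itself and every wrapper must account for the frame it adds. The convention as seen from user code:

	package main

	import (
		"fmt"
		"runtime"
	)

	func main() {
		pc := make([]uintptr, 8)
		// skip=0 would start at runtime.Callers itself; skip=1 starts
		// at this function. A helper that wraps Callers must pass
		// skip+1 to hide its own frame, as the gccgo fix does.
		n := runtime.Callers(1, pc)
		frames := runtime.CallersFrames(pc[:n])
		for {
			f, more := frames.Next()
			fmt.Println(f.Function)
			if !more {
				break
			}
		}
	}
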