
Commit d8adc6c

mknyszek authored and gopherbot committed
[release-branch.go1.23] runtime: prevent weak->strong conversions during mark termination
Currently it's possible for weak->strong conversions to create more GC work
during mark termination. When a weak->strong conversion happens during the
mark phase, we need to mark the newly-strong pointer, since it may now be
the only pointer to that object. In other words, the object could be white.
But queueing new white objects creates GC work, and if this happens during
mark termination, we could end up violating mark termination invariants.
In the parlance of the mark termination algorithm, the weak->strong
conversion is a non-monotonic source of GC work, unlike the write barriers
(which will eventually only see black objects).

This change fixes the problem by forcing weak->strong conversions to block
during mark termination. We can do this efficiently by setting a global
flag before the ragged barrier that is checked at each weak->strong
conversion. If the flag is set, then the conversions block. The ragged
barrier ensures that all Ps have observed the flag and that any
weak->strong conversions which completed before the ragged barrier have
their newly-minted strong pointers visible in GC work queues if necessary.
We later unset the flag and wake all the blocked goroutines during the mark
termination STW.

There are a few subtleties that we need to account for. For one, it's
possible that a goroutine which blocked in a weak->strong conversion wakes
up only to find that it's mark termination time again, so we need to
recheck the global flag on wake. We should also stay non-preemptible while
performing the check, so that if the check *does* appear as true, it cannot
switch back to false while we're actively trying to block; if it switched
to false while we were trying to block, we would be stuck in the queue
until the following GC.

All in all, this CL is more complicated than I would have liked, but it's
the only idea so far that is clearly correct to me at a high level.

This change also adds a test, which is somewhat invasive as it manipulates
mark termination, but hopefully that infrastructure will be useful for
debugging, fixing, and regression-testing mark termination whenever we do
fix it.

For #69803.
Fixes #70323.

Change-Id: Ie314e6fd357c9e2a07a9be21f217f75f7aba8c4a
Reviewed-on: https://go-review.googlesource.com/c/go/+/623615
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
(cherry picked from commit 80d306d)
Reviewed-on: https://go-review.googlesource.com/c/go/+/627615
TryBot-Bypass: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@google.com>
1 parent 847cb6f commit d8adc6c
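To make the scheme in the commit message concrete, here is a minimal, self-contained Go sketch of the flag/queue/wake pattern it describes, using ordinary sync primitives in place of the runtime-internal lock, goparkunlock, and injectglist machinery (which ordinary Go code cannot use). All names are illustrative, not the runtime's:

package main

import (
	"fmt"
	"sync"
)

// gate models work.strongFromWeak: a flag that converters check, plus a
// queue (here, a sync.Cond) of goroutines blocked while the flag is set.
type gate struct {
	mu    sync.Mutex
	cond  *sync.Cond
	block bool
}

func newGate() *gate {
	g := &gate{}
	g.cond = sync.NewCond(&g.mu)
	return g
}

// waitIfBlocked models gcParkStrongFromWeak: park while the flag is set,
// re-checking it on every wakeup, since a woken goroutine may find that
// the next cycle's mark termination has already begun.
func (g *gate) waitIfBlocked() {
	g.mu.Lock()
	for g.block {
		g.cond.Wait()
	}
	g.mu.Unlock()
}

// beginMarkTermination models setting the flag before the ragged barrier.
func (g *gate) beginMarkTermination() {
	g.mu.Lock()
	g.block = true
	g.mu.Unlock()
}

// endMarkTermination models clearing the flag and waking all parked
// converters, as gcWakeAllStrongFromWeak does during the mark
// termination stop-the-world.
func (g *gate) endMarkTermination() {
	g.mu.Lock()
	g.block = false
	g.mu.Unlock()
	g.cond.Broadcast()
}

func main() {
	g := newGate()
	g.beginMarkTermination()

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			g.waitIfBlocked() // a "weak->strong conversion" would block here
			fmt.Println("conversion", i, "proceeds")
		}(i)
	}

	g.endMarkTermination() // "mark termination" finishes; wake everyone
	wg.Wait()
}

The real runtime cannot use sync.Cond here: the flag check must happen while non-preemptible (acquirem), and parked goroutines are re-injected into the scheduler with injectglist, which is why work.strongFromWeak carries its own mutex-protected gQueue (see mheap.go below).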

File tree

8 files changed: +340 −132 lines

src/runtime/export_test.go

Lines changed: 27 additions & 0 deletions

@@ -1886,3 +1886,30 @@ func (m *TraceMap) PutString(s string) (uint64, bool) {
 func (m *TraceMap) Reset() {
 	m.traceMap.reset()
 }
+
+func SetSpinInGCMarkDone(spin bool) {
+	gcDebugMarkDone.spinAfterRaggedBarrier.Store(spin)
+}
+
+func GCMarkDoneRestarted() bool {
+	// Only read this outside of the GC. If we're running during a GC, just report false.
+	mp := acquirem()
+	if gcphase != _GCoff {
+		releasem(mp)
+		return false
+	}
+	restarted := gcDebugMarkDone.restartedDueTo27993
+	releasem(mp)
+	return restarted
+}
+
+func GCMarkDoneResetRestartFlag() {
+	mp := acquirem()
+	for gcphase != _GCoff {
+		releasem(mp)
+		Gosched()
+		mp = acquirem()
+	}
+	gcDebugMarkDone.restartedDueTo27993 = false
+	releasem(mp)
+}

src/runtime/gc_test.go

Lines changed: 77 additions & 0 deletions

@@ -6,6 +6,8 @@ package runtime_test

 import (
 	"fmt"
+	"internal/testenv"
+	"internal/weak"
 	"math/bits"
 	"math/rand"
 	"os"
@@ -787,3 +789,78 @@ func TestMemoryLimitNoGCPercent(t *testing.T) {
 func TestMyGenericFunc(t *testing.T) {
 	runtime.MyGenericFunc[int]()
 }
+
+func TestWeakToStrongMarkTermination(t *testing.T) {
+	testenv.MustHaveParallelism(t)
+
+	type T struct {
+		a *int
+		b int
+	}
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
+	defer debug.SetGCPercent(debug.SetGCPercent(-1))
+	w := make([]weak.Pointer[T], 2048)
+
+	// Make sure there's no outstanding GC from a previous test.
+	runtime.GC()
+
+	// Create many objects with weak pointers to them.
+	for i := range w {
+		x := new(T)
+		x.a = new(int)
+		w[i] = weak.Make(x)
+	}
+
+	// Reset the restart flag.
+	runtime.GCMarkDoneResetRestartFlag()
+
+	// Prevent mark termination from completing.
+	runtime.SetSpinInGCMarkDone(true)
+
+	// Start a GC, and wait a little bit to get something spinning in mark termination.
+	// Simultaneously, fire off another goroutine to disable spinning. If everything's
+	// working correctly, then weak.Strong will block, so we need to make sure something
+	// prevents the GC from continuing to spin.
+	done := make(chan struct{})
+	go func() {
+		runtime.GC()
+		done <- struct{}{}
+	}()
+	go func() {
+		time.Sleep(100 * time.Millisecond)
+
+		// Let mark termination continue.
+		runtime.SetSpinInGCMarkDone(false)
+	}()
+	time.Sleep(10 * time.Millisecond)
+
+	// Perform many weak->strong conversions in the critical window.
+	var wg sync.WaitGroup
+	for _, wp := range w {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			wp.Strong()
+		}()
+	}
+
+	// Make sure the GC completes.
+	<-done
+
+	// Make sure all the weak->strong conversions finish.
+	wg.Wait()
+
+	// The bug is triggered if there's still mark work after gcMarkDone stops the world.
+	//
+	// This can manifest in one of two ways today:
+	// - An exceedingly rare crash in mark termination.
+	// - gcMarkDone restarts, as if issue #27993 is at play.
+	//
+	// Check for the latter. This is a fairly controlled environment, so #27993 is very
+	// unlikely to happen (it's already rare to begin with) but we'll always _appear_ to
+	// trigger the same bug if weak->strong conversions aren't properly coordinated with
+	// mark termination.
+	if runtime.GCMarkDoneRestarted() {
+		t.Errorf("gcMarkDone restarted")
+	}
+}

src/runtime/lockrank.go

Lines changed: 119 additions & 116 deletions

(Generated file; diff not rendered. lockrank.go is regenerated from mklockrank.go, whose change appears below.)

src/runtime/mgc.go

Lines changed: 49 additions & 0 deletions

@@ -190,6 +190,7 @@ func gcinit() {
 	work.markDoneSema = 1
 	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
 	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
+	lockInit(&work.strongFromWeak.lock, lockRankStrongFromWeakQueue)
 	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 }

@@ -418,6 +419,26 @@ type workType struct {
 		list gList
 	}

+	// strongFromWeak controls how the GC interacts with weak->strong
+	// pointer conversions.
+	strongFromWeak struct {
+		// block is a flag set during mark termination that prevents
+		// new weak->strong conversions from executing by blocking the
+		// goroutine and enqueuing it onto q.
+		//
+		// Mutated only by one goroutine at a time in gcMarkDone,
+		// with globally-synchronizing events like forEachP and
+		// stopTheWorld.
+		block bool
+
+		// q is a queue of goroutines that attempted to perform a
+		// weak->strong conversion during mark termination.
+		//
+		// Protected by lock.
+		lock mutex
+		q    gQueue
+	}
+
 	// cycles is the number of completed GC cycles, where a GC
 	// cycle is sweep termination, mark, mark termination, and
 	// sweep. This differs from memstats.numgc, which is
@@ -800,6 +821,19 @@ func gcStart(trigger gcTrigger) {
 // This is protected by markDoneSema.
 var gcMarkDoneFlushed uint32

+// gcDebugMarkDone contains fields used to debug/test mark termination.
+var gcDebugMarkDone struct {
+	// spinAfterRaggedBarrier forces gcMarkDone to spin after it executes
+	// the ragged barrier.
+	spinAfterRaggedBarrier atomic.Bool
+
+	// restartedDueTo27993 indicates that we restarted mark termination
+	// due to the bug described in issue #27993.
+	//
+	// Protected by worldsema.
+	restartedDueTo27993 bool
+}
+
 // gcMarkDone transitions the GC from mark to mark termination if all
 // reachable objects have been marked (that is, there are no grey
 // objects and can be no more in the future). Otherwise, it flushes
@@ -842,6 +876,10 @@ top:
 	// stop the world later, so acquire worldsema now.
 	semacquire(&worldsema)

+	// Prevent weak->strong conversions from generating additional
+	// GC work. forEachP will guarantee that it is observed globally.
+	work.strongFromWeak.block = true
+
 	// Flush all local buffers and collect flushedWork flags.
 	gcMarkDoneFlushed = 0
 	forEachP(waitReasonGCMarkTermination, func(pp *p) {
@@ -872,6 +910,10 @@ top:
 		goto top
 	}

+	// For debugging/testing.
+	for gcDebugMarkDone.spinAfterRaggedBarrier.Load() {
+	}
+
 	// There was no global work, no local work, and no Ps
 	// communicated work since we took markDoneSema. Therefore
 	// there are no grey objects and no more objects can be
@@ -910,6 +952,8 @@ top:
 		}
 	})
 	if restart {
+		gcDebugMarkDone.restartedDueTo27993 = true
+
 		getg().m.preemptoff = ""
 		systemstack(func() {
 			// Accumulate the time we were stopped before we had to start again.
@@ -936,6 +980,11 @@ top:
 	// start the world again.
 	gcWakeAllAssists()

+	// Wake all blocked weak->strong conversions. These will run
+	// when we start the world again.
+	work.strongFromWeak.block = false
+	gcWakeAllStrongFromWeak()
+
 	// Likewise, release the transition lock. Blocked
 	// workers and assists will run when we start the
 	// world again.

src/runtime/mheap.go

Lines changed: 47 additions & 1 deletion

@@ -2049,8 +2049,19 @@ func internal_weak_runtime_registerWeakPointer(p unsafe.Pointer) unsafe.Pointer
 func internal_weak_runtime_makeStrongFromWeak(u unsafe.Pointer) unsafe.Pointer {
 	handle := (*atomic.Uintptr)(u)

-	// Prevent preemption. We want to make sure that another GC cycle can't start.
+	// Prevent preemption. We want to make sure that another GC cycle can't start
+	// and that work.strongFromWeak.block can't change out from under us.
 	mp := acquirem()
+
+	// Yield to the GC if necessary.
+	if work.strongFromWeak.block {
+		releasem(mp)
+
+		// Try to park and wait for mark termination.
+		// N.B. gcParkStrongFromWeak calls acquirem before returning.
+		mp = gcParkStrongFromWeak()
+	}
+
 	p := handle.Load()
 	if p == 0 {
 		releasem(mp)
@@ -2092,6 +2103,41 @@ func internal_weak_runtime_makeStrongFromWeak(u unsafe.Pointer) unsafe.Pointer {
 	return ptr
 }

+// gcParkStrongFromWeak puts the current goroutine on the weak->strong queue and parks.
+func gcParkStrongFromWeak() *m {
+	// Prevent preemption as we check strongFromWeak, so it can't change out from under us.
+	mp := acquirem()
+
+	for work.strongFromWeak.block {
+		lock(&work.strongFromWeak.lock)
+		releasem(mp) // N.B. Holding the lock prevents preemption.
+
+		// Queue ourselves up.
+		work.strongFromWeak.q.pushBack(getg())
+
+		// Park.
+		goparkunlock(&work.strongFromWeak.lock, waitReasonGCWeakToStrongWait, traceBlockGCWeakToStrongWait, 2)
+
+		// Re-acquire the current M since we're going to check the condition again.
+		mp = acquirem()
+
+		// Re-check condition. We may have awoken in the next GC's mark termination phase.
+	}
+	return mp
+}
+
+// gcWakeAllStrongFromWeak wakes all currently blocked weak->strong
+// conversions. This is used at the end of a GC cycle.
+//
+// work.strongFromWeak.block must be false to prevent woken goroutines
+// from immediately going back to sleep.
+func gcWakeAllStrongFromWeak() {
+	lock(&work.strongFromWeak.lock)
+	list := work.strongFromWeak.q.popList()
+	injectglist(&list)
+	unlock(&work.strongFromWeak.lock)
+}
+
 // Retrieves or creates a weak pointer handle for the object p.
 func getOrAddWeakHandle(p unsafe.Pointer) *atomic.Uintptr {
 	// First try to retrieve without allocating.
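For orientation, the caller-facing shape of a weak->strong conversion: the new test uses the runtime-internal internal/weak package (weak.Make, Pointer.Strong), for which internal_weak_runtime_makeStrongFromWeak above is the entry point. The sketch below uses the public weak package added in later Go releases (weak.Make, Pointer.Value), so treat the exact API surface as an assumption rather than part of this go1.23 change:

package main

import (
	"fmt"
	"runtime"
	"weak"
)

func main() {
	x := new(int)
	*x = 42
	wp := weak.Make(x) // weak pointer: does not keep x alive by itself

	if s := wp.Value(); s != nil { // weak->strong conversion
		fmt.Println("still reachable:", *s)
	}

	runtime.KeepAlive(x) // x is reachable up to this point
	runtime.GC()
	if wp.Value() == nil { // may be nil now that x is unreachable
		fmt.Println("object reclaimed; conversion yields nil")
	}
}

The Value (or Strong) calls are exactly the operation this commit may briefly block while mark termination decides whether any grey objects remain.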

src/runtime/mklockrank.go

Lines changed: 2 additions & 0 deletions

@@ -50,6 +50,7 @@ NONE < defer;
 NONE <
   sweepWaiters,
   assistQueue,
+  strongFromWeakQueue,
   sweep;

 # Test only
@@ -66,6 +67,7 @@ assistQueue,
 hchan,
 pollDesc, # pollDesc can interact with timers, which can lock sched.
 scavenge,
+strongFromWeakQueue,
 sweep,
 sweepWaiters,
 testR,

src/runtime/runtime2.go

Lines changed: 2 additions & 0 deletions

@@ -1095,6 +1095,7 @@ const (
 	waitReasonTraceProcStatus    // "trace proc status"
 	waitReasonPageTraceFlush     // "page trace flush"
 	waitReasonCoroutine          // "coroutine"
+	waitReasonGCWeakToStrongWait // "GC weak to strong wait"
 )

 var waitReasonStrings = [...]string{
@@ -1135,6 +1136,7 @@ var waitReasonStrings = [...]string{
 	waitReasonTraceProcStatus:    "trace proc status",
 	waitReasonPageTraceFlush:     "page trace flush",
 	waitReasonCoroutine:          "coroutine",
+	waitReasonGCWeakToStrongWait: "GC weak to strong wait",
 }

 func (w waitReason) String() string {

src/runtime/traceruntime.go

Lines changed: 17 additions & 15 deletions

@@ -99,24 +99,26 @@
 	traceBlockDebugCall
 	traceBlockUntilGCEnds
 	traceBlockSleep
+	traceBlockGCWeakToStrongWait
 )

 var traceBlockReasonStrings = [...]string{
-	traceBlockGeneric:          "unspecified",
-	traceBlockForever:          "forever",
-	traceBlockNet:              "network",
-	traceBlockSelect:           "select",
-	traceBlockCondWait:         "sync.(*Cond).Wait",
-	traceBlockSync:             "sync",
-	traceBlockChanSend:         "chan send",
-	traceBlockChanRecv:         "chan receive",
-	traceBlockGCMarkAssist:     "GC mark assist wait for work",
-	traceBlockGCSweep:          "GC background sweeper wait",
-	traceBlockSystemGoroutine:  "system goroutine wait",
-	traceBlockPreempted:        "preempted",
-	traceBlockDebugCall:        "wait for debug call",
-	traceBlockUntilGCEnds:      "wait until GC ends",
-	traceBlockSleep:            "sleep",
+	traceBlockGeneric:            "unspecified",
+	traceBlockForever:            "forever",
+	traceBlockNet:                "network",
+	traceBlockSelect:             "select",
+	traceBlockCondWait:           "sync.(*Cond).Wait",
+	traceBlockSync:               "sync",
+	traceBlockChanSend:           "chan send",
+	traceBlockChanRecv:           "chan receive",
+	traceBlockGCMarkAssist:       "GC mark assist wait for work",
+	traceBlockGCSweep:            "GC background sweeper wait",
+	traceBlockSystemGoroutine:    "system goroutine wait",
+	traceBlockPreempted:          "preempted",
+	traceBlockDebugCall:          "wait for debug call",
+	traceBlockUntilGCEnds:        "wait until GC ends",
+	traceBlockSleep:              "sleep",
+	traceBlockGCWeakToStrongWait: "GC weak to strong wait",
 }

 // traceGoStopReason is an enumeration of reasons a goroutine might yield.
