Skip to content

Commit f4118a5

Browse files
author
Bryan C. Mills
committed
acme/autocert: fix races in renewal tests
TestRenewFromCache and TestRenewFromCacheAlreadyRenewed had several races and API misuses: 1. They called t.Fatalf from a goroutine other than the one invoking the Test function, which is explicitly disallowed (see https://pkg.go.dev/testing#T). 2. The test did not stop the renewal timers prior to restoring test-hook functions, and the process of stopping the renewal timers itself did not wait for in-flight calls to complete. That could cause data races if one of the renewals failed and triggered a retry with a short-enough randomized backoff. (One such race was observed in https://build.golang.org/log/1a19e22ad826bedeb5a939c6130f368f9979208a.) 3. The testDidRenewLoop hooks accessed the Manager.renewal field without locking the Mutex guarding that field. 4. TestGetCertificate_failedAttempt set a testDidRemoveState hook, but didn't wait for the timers referring to that hook to complete before restoring it, causing races with other timers. I tried pulling on that thread a bit, but couldn't untangle the numerous untracked goroutines in the package. Instead, I have made a smaller and more local change to copy the value of testDidRemoveState into a local variable in the timer's closure. Given the number of untracked goroutines in this package, it is likely that races and/or deadlocks remain. Notably, so far I have been unable to spot the actual cause of golang/go#51080. For golang/go#51080 Change-Id: I7797f6ac34ef3c272f16ca805251dac3aa7f0009 Reviewed-on: https://go-review.googlesource.com/c/crypto/+/384594 Trust: Bryan Mills <bcmills@google.com> Run-TryBot: Bryan Mills <bcmills@google.com> Reviewed-by: Filippo Valsorda <filippo@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org>
1 parent db63837 commit f4118a5

File tree

3 files changed

+98
-58
lines changed

3 files changed

+98
-58
lines changed

acme/autocert/autocert.go

+6-5
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ func (m *Manager) cert(ctx context.Context, ck certKey) (*tls.Certificate, error
458458
leaf: cert.Leaf,
459459
}
460460
m.state[ck] = s
461-
go m.renew(ck, s.key, s.leaf.NotAfter)
461+
go m.startRenew(ck, s.key, s.leaf.NotAfter)
462462
return cert, nil
463463
}
464464

@@ -584,8 +584,9 @@ func (m *Manager) createCert(ctx context.Context, ck certKey) (*tls.Certificate,
584584
if err != nil {
585585
// Remove the failed state after some time,
586586
// making the manager call createCert again on the following TLS hello.
587+
didRemove := testDidRemoveState // The lifetime of this timer is untracked, so copy mutable local state to avoid races.
587588
time.AfterFunc(createCertRetryAfter, func() {
588-
defer testDidRemoveState(ck)
589+
defer didRemove(ck)
589590
m.stateMu.Lock()
590591
defer m.stateMu.Unlock()
591592
// Verify the state hasn't changed and it's still invalid
@@ -603,7 +604,7 @@ func (m *Manager) createCert(ctx context.Context, ck certKey) (*tls.Certificate,
603604
}
604605
state.cert = der
605606
state.leaf = leaf
606-
go m.renew(ck, state.key, state.leaf.NotAfter)
607+
go m.startRenew(ck, state.key, state.leaf.NotAfter)
607608
return state.tlscert()
608609
}
609610

@@ -893,15 +894,15 @@ func httpTokenCacheKey(tokenPath string) string {
893894
return path.Base(tokenPath) + "+http-01"
894895
}
895896

896-
// renew starts a cert renewal timer loop, one per domain.
897+
// startRenew starts a cert renewal timer loop, one per domain.
897898
//
898899
// The loop is scheduled in two cases:
899900
// - a cert was fetched from cache for the first time (wasn't in m.state)
900901
// - a new cert was created by m.createCert
901902
//
902903
// The key argument is a certificate private key.
903904
// The exp argument is the cert expiration time (NotAfter).
904-
func (m *Manager) renew(ck certKey, key crypto.Signer, exp time.Time) {
905+
func (m *Manager) startRenew(ck certKey, key crypto.Signer, exp time.Time) {
905906
m.renewalMu.Lock()
906907
defer m.renewalMu.Unlock()
907908
if m.renewal[ck] != nil {

acme/autocert/renewal.go

+25-10
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ type domainRenewal struct {
2121
ck certKey
2222
key crypto.Signer
2323

24-
timerMu sync.Mutex
25-
timer *time.Timer
24+
timerMu sync.Mutex
25+
timer *time.Timer
26+
timerClose chan struct{} // if non-nil, renew closes this channel (and nils out the timer fields) instead of running
2627
}
2728

2829
// start starts a cert renewal timer at the time
@@ -38,24 +39,38 @@ func (dr *domainRenewal) start(exp time.Time) {
3839
dr.timer = time.AfterFunc(dr.next(exp), dr.renew)
3940
}
4041

41-
// stop stops the cert renewal timer.
42-
// If the timer is already stopped, calling stop is a noop.
42+
// stop stops the cert renewal timer and waits for any in-flight calls to renew
43+
// to complete. If the timer is already stopped, calling stop is a noop.
4344
func (dr *domainRenewal) stop() {
4445
dr.timerMu.Lock()
4546
defer dr.timerMu.Unlock()
46-
if dr.timer == nil {
47-
return
47+
for {
48+
if dr.timer == nil {
49+
return
50+
}
51+
if dr.timer.Stop() {
52+
dr.timer = nil
53+
return
54+
} else {
55+
// dr.timer fired, and we acquired dr.timerMu before the renew callback did.
56+
// (We know this because otherwise the renew callback would have reset dr.timer!)
57+
timerClose := make(chan struct{})
58+
dr.timerClose = timerClose
59+
dr.timerMu.Unlock()
60+
<-timerClose
61+
dr.timerMu.Lock()
62+
}
4863
}
49-
dr.timer.Stop()
50-
dr.timer = nil
5164
}
5265

5366
// renew is called periodically by a timer.
5467
// The first renew call is kicked off by dr.start.
5568
func (dr *domainRenewal) renew() {
5669
dr.timerMu.Lock()
5770
defer dr.timerMu.Unlock()
58-
if dr.timer == nil {
71+
if dr.timerClose != nil {
72+
close(dr.timerClose)
73+
dr.timer, dr.timerClose = nil, nil
5974
return
6075
}
6176

@@ -67,8 +82,8 @@ func (dr *domainRenewal) renew() {
6782
next = renewJitter / 2
6883
next += time.Duration(pseudoRand.int63n(int64(next)))
6984
}
70-
dr.timer = time.AfterFunc(next, dr.renew)
7185
testDidRenewLoop(next, err)
86+
dr.timer = time.AfterFunc(next, dr.renew)
7287
}
7388

7489
// updateState locks and replaces the relevant Manager.state item with the given

acme/autocert/renewal_test.go

+67-43
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,23 @@ func TestRenewFromCache(t *testing.T) {
6161

6262
// verify the renewal happened
6363
defer func() {
64+
// Stop the timers that read and execute testDidRenewLoop before restoring it.
65+
// Otherwise the timer callback may race with the deferred write.
66+
man.stopRenew()
6467
testDidRenewLoop = func(next time.Duration, err error) {}
6568
}()
66-
done := make(chan struct{})
69+
renewed := make(chan bool, 1)
6770
testDidRenewLoop = func(next time.Duration, err error) {
68-
defer close(done)
71+
defer func() {
72+
select {
73+
case renewed <- true:
74+
default:
75+
// The renewal timer uses a random backoff. If the first renewal fails for
76+
// some reason, we could end up with multiple calls here before the test
77+
// stops the timer.
78+
}
79+
}()
80+
6981
if err != nil {
7082
t.Errorf("testDidRenewLoop: %v", err)
7183
}
@@ -81,7 +93,8 @@ func TestRenewFromCache(t *testing.T) {
8193
after := time.Now().Add(future)
8294
tlscert, err := man.cacheGet(context.Background(), exampleCertKey)
8395
if err != nil {
84-
t.Fatalf("man.cacheGet: %v", err)
96+
t.Errorf("man.cacheGet: %v", err)
97+
return
8598
}
8699
if !tlscert.Leaf.NotAfter.After(after) {
87100
t.Errorf("cache leaf.NotAfter = %v; want > %v", tlscert.Leaf.NotAfter, after)
@@ -92,11 +105,13 @@ func TestRenewFromCache(t *testing.T) {
92105
defer man.stateMu.Unlock()
93106
s := man.state[exampleCertKey]
94107
if s == nil {
95-
t.Fatalf("m.state[%q] is nil", exampleCertKey)
108+
t.Errorf("m.state[%q] is nil", exampleCertKey)
109+
return
96110
}
97111
tlscert, err = s.tlscert()
98112
if err != nil {
99-
t.Fatalf("s.tlscert: %v", err)
113+
t.Errorf("s.tlscert: %v", err)
114+
return
100115
}
101116
if !tlscert.Leaf.NotAfter.After(after) {
102117
t.Errorf("state leaf.NotAfter = %v; want > %v", tlscert.Leaf.NotAfter, after)
@@ -108,13 +123,7 @@ func TestRenewFromCache(t *testing.T) {
108123
if _, err := man.GetCertificate(hello); err != nil {
109124
t.Fatal(err)
110125
}
111-
112-
// wait for renew loop
113-
select {
114-
case <-time.After(10 * time.Second):
115-
t.Fatal("renew took too long to occur")
116-
case <-done:
117-
}
126+
<-renewed
118127
}
119128

120129
func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
@@ -159,11 +168,23 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
159168

160169
// verify the renewal accepted the newer cached cert
161170
defer func() {
171+
// Stop the timers that read and execute testDidRenewLoop before restoring it.
172+
// Otherwise the timer callback may race with the deferred write.
173+
man.stopRenew()
162174
testDidRenewLoop = func(next time.Duration, err error) {}
163175
}()
164-
done := make(chan struct{})
176+
renewed := make(chan bool, 1)
165177
testDidRenewLoop = func(next time.Duration, err error) {
166-
defer close(done)
178+
defer func() {
179+
select {
180+
case renewed <- true:
181+
default:
182+
// The renewal timer uses a random backoff. If the first renewal fails for
183+
// some reason, we could end up with multiple calls here before the test
184+
// stops the timer.
185+
}
186+
}()
187+
167188
if err != nil {
168189
t.Errorf("testDidRenewLoop: %v", err)
169190
}
@@ -177,7 +198,8 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
177198
// ensure the cached cert was not modified
178199
tlscert, err := man.cacheGet(context.Background(), exampleCertKey)
179200
if err != nil {
180-
t.Fatalf("man.cacheGet: %v", err)
201+
t.Errorf("man.cacheGet: %v", err)
202+
return
181203
}
182204
if !tlscert.Leaf.NotAfter.Equal(newLeaf.NotAfter) {
183205
t.Errorf("cache leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
@@ -188,30 +210,22 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
188210
defer man.stateMu.Unlock()
189211
s := man.state[exampleCertKey]
190212
if s == nil {
191-
t.Fatalf("m.state[%q] is nil", exampleCertKey)
213+
t.Errorf("m.state[%q] is nil", exampleCertKey)
214+
return
192215
}
193216
stateKey := s.key.Public().(*ecdsa.PublicKey)
194217
if !stateKey.Equal(newLeaf.PublicKey) {
195-
t.Fatal("state key was not updated from cache")
218+
t.Error("state key was not updated from cache")
219+
return
196220
}
197221
tlscert, err = s.tlscert()
198222
if err != nil {
199-
t.Fatalf("s.tlscert: %v", err)
223+
t.Errorf("s.tlscert: %v", err)
224+
return
200225
}
201226
if !tlscert.Leaf.NotAfter.Equal(newLeaf.NotAfter) {
202227
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
203228
}
204-
205-
// verify the private key is replaced in the renewal state
206-
r := man.renewal[exampleCertKey]
207-
if r == nil {
208-
t.Fatalf("m.renewal[%q] is nil", exampleCertKey)
209-
}
210-
renewalKey := r.key.Public().(*ecdsa.PublicKey)
211-
if !renewalKey.Equal(newLeaf.PublicKey) {
212-
t.Fatal("renewal private key was not updated from cache")
213-
}
214-
215229
}
216230

217231
// assert the expiring cert is returned from state
@@ -225,21 +239,31 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
225239
}
226240

227241
// trigger renew
228-
go man.renew(exampleCertKey, s.key, s.leaf.NotAfter)
229-
230-
// wait for renew loop
231-
select {
232-
case <-time.After(10 * time.Second):
233-
t.Fatal("renew took too long to occur")
234-
case <-done:
235-
// assert the new cert is returned from state after renew
236-
hello := clientHelloInfo(exampleDomain, algECDSA)
237-
tlscert, err := man.GetCertificate(hello)
238-
if err != nil {
239-
t.Fatal(err)
242+
man.startRenew(exampleCertKey, s.key, s.leaf.NotAfter)
243+
<-renewed
244+
func() {
245+
man.renewalMu.Lock()
246+
defer man.renewalMu.Unlock()
247+
248+
// verify the private key is replaced in the renewal state
249+
r := man.renewal[exampleCertKey]
250+
if r == nil {
251+
t.Errorf("m.renewal[%q] is nil", exampleCertKey)
252+
return
240253
}
241-
if !newLeaf.NotAfter.Equal(tlscert.Leaf.NotAfter) {
242-
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
254+
renewalKey := r.key.Public().(*ecdsa.PublicKey)
255+
if !renewalKey.Equal(newLeaf.PublicKey) {
256+
t.Error("renewal private key was not updated from cache")
243257
}
258+
}()
259+
260+
// assert the new cert is returned from state after renew
261+
hello = clientHelloInfo(exampleDomain, algECDSA)
262+
tlscert, err = man.GetCertificate(hello)
263+
if err != nil {
264+
t.Fatal(err)
265+
}
266+
if !newLeaf.NotAfter.Equal(tlscert.Leaf.NotAfter) {
267+
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
244268
}
245269
}

0 commit comments

Comments
 (0)