Skip to content

Commit 78100fe

Browse files
authored
feat(compute/metadata): allow canceling GCE detection (#11786)
* Add a new `OnGCEWithContext` function that allows passing in a cancellable context when performing runtime GCE platform detection. * Enable the caller to pass in a context. Previously there was a possibility that during GCE platform detection, the metadata address and the DNS resolution steps could simultaneously block, which prevents the function from returning, or can delay its return significantly. Context mitigates this as the caller can specify a deadline on the context or otherwise manage its cancellation.
1 parent 62f3416 commit 78100fe

File tree

5 files changed

+157
-79
lines changed

5 files changed

+157
-79
lines changed

compute/metadata/metadata.go

Lines changed: 88 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -117,80 +117,18 @@ var (
117117
// NOTE: True returned from `OnGCE` does not guarantee that the metadata server
118118
// is accessible from this process and has all the metadata defined.
119119
func OnGCE() bool {
120-
onGCEOnce.Do(initOnGCE)
121-
return onGCE
122-
}
123-
124-
func initOnGCE() {
125-
onGCE = testOnGCE()
120+
return OnGCEWithContext(context.Background())
126121
}
127122

128-
func testOnGCE() bool {
129-
// The user explicitly said they're on GCE, so trust them.
130-
if os.Getenv(metadataHostEnv) != "" {
131-
return true
132-
}
133-
134-
ctx, cancel := context.WithCancel(context.Background())
135-
defer cancel()
136-
137-
resc := make(chan bool, 2)
138-
139-
// Try two strategies in parallel.
140-
// See https://github.com/googleapis/google-cloud-go/issues/194
141-
go func() {
142-
req, _ := http.NewRequest("GET", "http://"+metadataIP, nil)
143-
req.Header.Set("User-Agent", userAgent)
144-
res, err := newDefaultHTTPClient().Do(req.WithContext(ctx))
145-
if err != nil {
146-
resc <- false
147-
return
148-
}
149-
defer res.Body.Close()
150-
resc <- res.Header.Get("Metadata-Flavor") == "Google"
151-
}()
152-
153-
go func() {
154-
resolver := &net.Resolver{}
155-
addrs, err := resolver.LookupHost(ctx, "metadata.google.internal.")
156-
if err != nil || len(addrs) == 0 {
157-
resc <- false
158-
return
159-
}
160-
resc <- strsContains(addrs, metadataIP)
161-
}()
162-
163-
tryHarder := systemInfoSuggestsGCE()
164-
if tryHarder {
165-
res := <-resc
166-
if res {
167-
// The first strategy succeeded, so let's use it.
168-
return true
169-
}
170-
// Wait for either the DNS or metadata server probe to
171-
// contradict the other one and say we are running on
172-
// GCE. Give it a lot of time to do so, since the system
173-
// info already suggests we're running on a GCE BIOS.
174-
timer := time.NewTimer(5 * time.Second)
175-
defer timer.Stop()
176-
select {
177-
case res = <-resc:
178-
return res
179-
case <-timer.C:
180-
// Too slow. Who knows what this system is.
181-
return false
182-
}
183-
}
184-
185-
// There's no hint from the system info that we're running on
186-
// GCE, so use the first probe's result as truth, whether it's
187-
// true or false. The goal here is to optimize for speed for
188-
// users who are NOT running on GCE. We can't assume that
189-
// either a DNS lookup or an HTTP request to a blackholed IP
190-
// address is fast. Worst case this should return when the
191-
// metaClient's Transport.ResponseHeaderTimeout or
192-
// Transport.Dial.Timeout fires (in two seconds).
193-
return <-resc
123+
// OnGCEWithContext reports whether this process is running on Google Compute Platforms.
124+
// This function's return value is memoized for better performance.
125+
// NOTE: True returned from `OnGCEWithContext` does not guarantee that the metadata server
126+
// is accessible from this process and has all the metadata defined.
127+
func OnGCEWithContext(ctx context.Context) bool {
128+
onGCEOnce.Do(func() {
129+
onGCE = defaultClient.OnGCEWithContext(ctx)
130+
})
131+
return onGCE
194132
}
195133

196134
// Subscribe calls Client.SubscribeWithContext on the default client.
@@ -450,6 +388,84 @@ func NewWithOptions(opts *Options) *Client {
450388
return &Client{hc: client, logger: logger}
451389
}
452390

391+
// NOTE: metadataRequestStrategy is assigned to a variable for test stubbing purposes.
392+
var metadataRequestStrategy = func(ctx context.Context, httpClient *http.Client, resc chan bool) {
393+
req, _ := http.NewRequest("GET", "http://"+metadataIP, nil)
394+
req.Header.Set("User-Agent", userAgent)
395+
res, err := httpClient.Do(req.WithContext(ctx))
396+
if err != nil {
397+
resc <- false
398+
return
399+
}
400+
defer res.Body.Close()
401+
resc <- res.Header.Get("Metadata-Flavor") == "Google"
402+
}
403+
404+
// NOTE: dnsRequestStrategy is assigned to a variable for test stubbing purposes.
405+
var dnsRequestStrategy = func(ctx context.Context, resc chan bool) {
406+
resolver := &net.Resolver{}
407+
addrs, err := resolver.LookupHost(ctx, "metadata.google.internal.")
408+
if err != nil || len(addrs) == 0 {
409+
resc <- false
410+
return
411+
}
412+
resc <- strsContains(addrs, metadataIP)
413+
}
414+
415+
// OnGCEWithContext reports whether this process is running on Google Compute Platforms.
416+
// NOTE: True returned from `OnGCEWithContext` does not guarantee that the metadata server
417+
// is accessible from this process and has all the metadata defined.
418+
func (c *Client) OnGCEWithContext(ctx context.Context) bool {
419+
// The user explicitly said they're on GCE, so trust them.
420+
if os.Getenv(metadataHostEnv) != "" {
421+
return true
422+
}
423+
424+
ctx, cancel := context.WithCancel(ctx)
425+
defer cancel()
426+
427+
resc := make(chan bool, 2)
428+
429+
// Try two strategies in parallel.
430+
// See https://github.com/googleapis/google-cloud-go/issues/194
431+
go metadataRequestStrategy(ctx, c.hc, resc)
432+
go dnsRequestStrategy(ctx, resc)
433+
434+
tryHarder := systemInfoSuggestsGCE()
435+
if tryHarder {
436+
res := <-resc
437+
if res {
438+
// The first strategy succeeded, so let's use it.
439+
return true
440+
}
441+
442+
// Wait for either the DNS or metadata server probe to
443+
// contradict the other one and say we are running on
444+
// GCE. Give it a lot of time to do so, since the system
445+
// info already suggests we're running on a GCE BIOS.
446+
// Ensure cancellations from the calling context are respected.
447+
waitContext, cancelWait := context.WithTimeout(ctx, 5*time.Second)
448+
defer cancelWait()
449+
select {
450+
case res = <-resc:
451+
return res
452+
case <-waitContext.Done():
453+
// Too slow. Who knows what this system is.
454+
return false
455+
}
456+
}
457+
458+
// There's no hint from the system info that we're running on
459+
// GCE, so use the first probe's result as truth, whether it's
460+
// true or false. The goal here is to optimize for speed for
461+
// users who are NOT running on GCE. We can't assume that
462+
// either a DNS lookup or an HTTP request to a blackholed IP
463+
// address is fast. Worst case this should return when the
464+
// metaClient's Transport.ResponseHeaderTimeout or
465+
// Transport.Dial.Timeout fires (in two seconds).
466+
return <-resc
467+
}
468+
453469
// getETag returns a value from the metadata service as well as the associated ETag.
454470
// This func is otherwise equivalent to Get.
455471
func (c *Client) getETag(ctx context.Context, suffix string) (value, etag string, err error) {

compute/metadata/metadata_test.go

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,15 @@ import (
2727
)
2828

2929
func TestOnGCE_Stress(t *testing.T) {
30+
ctx := context.Background()
3031
if testing.Short() {
3132
t.Skip("skipping in -short mode")
3233
}
3334
var last bool
3435
for i := 0; i < 100; i++ {
3536
onGCEOnce = sync.Once{}
3637

37-
now := OnGCE()
38+
now := OnGCEWithContext(ctx)
3839
if i > 0 && now != last {
3940
t.Errorf("%d. changed from %v to %v", i, last, now)
4041
}
@@ -44,15 +45,70 @@ func TestOnGCE_Stress(t *testing.T) {
4445
}
4546

4647
func TestOnGCE_Force(t *testing.T) {
48+
ctx := context.Background()
4749
onGCEOnce = sync.Once{}
4850
old := os.Getenv(metadataHostEnv)
4951
defer os.Setenv(metadataHostEnv, old)
5052
os.Setenv(metadataHostEnv, "127.0.0.1")
51-
if !OnGCE() {
53+
if !OnGCEWithContext(ctx) {
5254
t.Error("OnGCE() = false; want true")
5355
}
5456
}
5557

58+
func TestOnGCE_Cancel(t *testing.T) {
59+
ctx, cancel := context.WithCancel(context.Background())
60+
cancel()
61+
onGCEOnce = sync.Once{}
62+
if OnGCEWithContext(ctx) {
63+
t.Error("OnGCE() = true; want false")
64+
}
65+
}
66+
67+
func TestOnGCE_CancelTryHarder(t *testing.T) {
68+
// If system info suggests GCE, we allow extra time for the
69+
// probe with higher latency (HTTP or DNS) to return. In this
70+
// test, the system info suggests GCE, the DNS probe fails
71+
// immediately, and the HTTP probe would succeed after 750ms.
72+
// However, the user-provided context deadline is 500ms. GCE
73+
// detection should fail, respecting the provided context.
74+
//
75+
// NOTE: This code could create a data race if tests are run
76+
// in parallel.
77+
origSystemInfoSuggestsGCE := systemInfoSuggestsGCE
78+
origMetadataRequestStrategy := metadataRequestStrategy
79+
origDNSRequestStrategy := dnsRequestStrategy
80+
systemInfoSuggestsGCE = func() bool { return true }
81+
metadataRequestStrategy = func(_ context.Context, _ *http.Client, resc chan bool) {
82+
time.Sleep(750 * time.Millisecond)
83+
resc <- true
84+
}
85+
dnsRequestStrategy = func(_ context.Context, resc chan bool) {
86+
resc <- false
87+
}
88+
defer func() {
89+
systemInfoSuggestsGCE = origSystemInfoSuggestsGCE
90+
metadataRequestStrategy = origMetadataRequestStrategy
91+
dnsRequestStrategy = origDNSRequestStrategy
92+
}()
93+
94+
// Set deadline upper-limit to 500ms
95+
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
96+
defer cancel()
97+
98+
// Set HTTP deadline to 1s
99+
c := NewClient(&http.Client{Transport: sleepyTransport{1 * time.Second}})
100+
101+
start := time.Now()
102+
if c.OnGCEWithContext(ctx) {
103+
t.Error("OnGCE() = true; want false")
104+
}
105+
106+
// Should have returned around 500ms, but account for some scheduling budget
107+
if time.Now().Sub(start) > 510*time.Millisecond {
108+
t.Error("OnGCE() did not return within deadline")
109+
}
110+
}
111+
56112
func TestOverrideUserAgent(t *testing.T) {
57113
ctx := context.Background()
58114
const userAgent = "my-user-agent"
@@ -214,7 +270,7 @@ func TestClientGetWithContext(t *testing.T) {
214270
t.Run(tc.name, func(t *testing.T) {
215271
ctx, cancel := context.WithTimeout(context.Background(), tc.ctxTimeout)
216272
defer cancel()
217-
c := NewClient(&http.Client{Transport: sleepyTransport{}})
273+
c := NewClient(&http.Client{Transport: sleepyTransport{500 * time.Millisecond}})
218274
_, err := c.GetWithContext(ctx, "foo")
219275
if tc.wantErr && err == nil {
220276
t.Fatal("c.GetWithContext() == nil, want an error")
@@ -227,14 +283,15 @@ func TestClientGetWithContext(t *testing.T) {
227283
}
228284

229285
type sleepyTransport struct {
286+
delay time.Duration
230287
}
231288

232289
func (s sleepyTransport) RoundTrip(req *http.Request) (*http.Response, error) {
233290
req.Context().Done()
234291
select {
235292
case <-req.Context().Done():
236293
return nil, req.Context().Err()
237-
case <-time.After(500 * time.Millisecond):
294+
case <-time.After(s.delay):
238295
}
239296
return &http.Response{StatusCode: http.StatusOK, Body: io.NopCloser(strings.NewReader("I woke up"))}, nil
240297
}

compute/metadata/syscheck.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ package metadata
2020
// doing network requests) suggests that we're running on GCE. If this
2121
// returns true, OnGCEWithContext tries a bit harder to reach its metadata
2222
// server.
23-
func systemInfoSuggestsGCE() bool {
23+
//
24+
// NOTE: systemInfoSuggestsGCE is assigned to a variable for test stubbing purposes.
25+
var systemInfoSuggestsGCE = func() bool {
2426
// We don't currently have checks for other GOOS
2527
return false
2628
}

compute/metadata/syscheck_linux.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ import (
2121
"strings"
2222
)
2323

24-
func systemInfoSuggestsGCE() bool {
24+
// NOTE: systemInfoSuggestsGCE is assigned to a variable for test stubbing purposes.
25+
var systemInfoSuggestsGCE = func() bool {
2526
b, _ := os.ReadFile("/sys/class/dmi/id/product_name")
27+
2628
name := strings.TrimSpace(string(b))
2729
return name == "Google" || name == "Google Compute Engine"
2830
}

compute/metadata/syscheck_windows.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ import (
2222
"golang.org/x/sys/windows/registry"
2323
)
2424

25-
func systemInfoSuggestsGCE() bool {
25+
// NOTE: systemInfoSuggestsGCE is assigned to a variable for test stubbing purposes.
26+
var systemInfoSuggestsGCE = func() bool {
2627
k, err := registry.OpenKey(registry.LOCAL_MACHINE, `SYSTEM\HardwareConfig\Current`, registry.QUERY_VALUE)
2728
if err != nil {
2829
return false

0 commit comments

Comments
 (0)