20
20
import static com .google .common .collect .ImmutableSet .toImmutableSet ;
21
21
import static org .openqa .selenium .grid .data .Availability .DOWN ;
22
22
import static org .openqa .selenium .grid .data .Availability .DRAINING ;
23
+ import static org .openqa .selenium .grid .data .Availability .UP ;
23
24
import static org .openqa .selenium .internal .Debug .getDebugLogLevel ;
24
25
import static org .openqa .selenium .remote .RemoteTags .CAPABILITIES ;
25
26
import static org .openqa .selenium .remote .RemoteTags .CAPABILITIES_EVENT ;
44
45
import org .openqa .selenium .concurrent .GuardedRunnable ;
45
46
import org .openqa .selenium .events .EventBus ;
46
47
import org .openqa .selenium .grid .config .Config ;
48
+ import org .openqa .selenium .grid .data .Availability ;
47
49
import org .openqa .selenium .grid .data .CreateSessionRequest ;
48
50
import org .openqa .selenium .grid .data .CreateSessionResponse ;
49
51
import org .openqa .selenium .grid .data .DistributorStatus ;
91
93
92
94
import java .io .Closeable ;
93
95
import java .io .UncheckedIOException ;
96
+ import java .net .URI ;
94
97
import java .time .Duration ;
95
98
import java .util .ArrayList ;
96
99
import java .util .Collection ;
@@ -297,8 +300,10 @@ public LocalDistributor add(Node node) {
297
300
298
301
// An exception occurs if Node heartbeat has started but the server is not ready.
299
302
// Unhandled exception blocks the event-bus thread from processing any event henceforth.
303
+ NodeStatus initialNodeStatus ;
300
304
try {
301
- model .add (node .getStatus ());
305
+ initialNodeStatus = node .getStatus ();
306
+ model .add (initialNodeStatus );
302
307
nodes .put (node .getId (), node );
303
308
} catch (Exception e ) {
304
309
LOG .log (
@@ -309,21 +314,10 @@ public LocalDistributor add(Node node) {
309
314
}
310
315
311
316
// Extract the health check
312
- Runnable runnableHealthCheck = asRunnableHealthCheck (node );
313
- allChecks .put (node .getId (), runnableHealthCheck );
314
-
315
- // Running the health check right after the Node registers itself. We retry the
316
- // execution because the Node might on a complex network topology. For example,
317
- // Kubernetes pods with IPs that take a while before they are reachable.
318
- RetryPolicy <Object > initialHealthCheckPolicy = new RetryPolicy <>()
319
- .withMaxAttempts (-1 )
320
- .withMaxDuration (Duration .ofSeconds (90 ))
321
- .withDelay (Duration .ofSeconds (15 ))
322
- .abortIf (result -> true );
323
-
324
- LOG .log (getDebugLogLevel (), "Running initial health check for Node " + node .getUri ());
325
- Executors .newSingleThreadExecutor ().submit (
326
- () -> Failsafe .with (initialHealthCheckPolicy ).run (runnableHealthCheck ::run ));
317
+ Runnable healthCheck = asRunnableHealthCheck (node );
318
+ allChecks .put (node .getId (), healthCheck );
319
+
320
+ updateNodeStatus (initialNodeStatus , healthCheck );
327
321
328
322
LOG .info (String .format (
329
323
"Added node %s at %s. Health check every %ss" ,
@@ -336,6 +330,27 @@ public LocalDistributor add(Node node) {
336
330
return this ;
337
331
}
338
332
333
+ private void updateNodeStatus (NodeStatus status , Runnable healthCheck ) {
334
+ // Setting the Node as available if the initial call to status was successful.
335
+ // Otherwise, retry to have it available as soon as possible.
336
+ if (status .getAvailability () == UP ) {
337
+ updateNodeAvailability (status .getExternalUri (), status .getNodeId (), status .getAvailability ());
338
+ } else {
339
+ // Running the health check right after the Node registers itself. We retry the
340
+ // execution because the Node might on a complex network topology. For example,
341
+ // Kubernetes pods with IPs that take a while before they are reachable.
342
+ RetryPolicy <Object > initialHealthCheckPolicy = new RetryPolicy <>()
343
+ .withMaxAttempts (-1 )
344
+ .withMaxDuration (Duration .ofSeconds (90 ))
345
+ .withDelay (Duration .ofSeconds (15 ))
346
+ .abortIf (result -> true );
347
+
348
+ LOG .log (getDebugLogLevel (), "Running health check for Node " + status .getExternalUri ());
349
+ Executors .newSingleThreadExecutor ().submit (
350
+ () -> Failsafe .with (initialHealthCheckPolicy ).run (healthCheck ::run ));
351
+ }
352
+ }
353
+
339
354
private Runnable runNodeHealthChecks () {
340
355
return () -> {
341
356
ImmutableMap <NodeId , Runnable > nodeHealthChecks = ImmutableMap .copyOf (allChecks );
@@ -363,26 +378,27 @@ private Runnable asRunnableHealthCheck(Node node) {
363
378
failedCheckException = e ;
364
379
}
365
380
366
- Lock writeLock = lock .writeLock ();
367
- writeLock .lock ();
368
- try {
369
- LOG .log (
370
- getDebugLogLevel (),
371
- String .format (
372
- "Health check result for %s was %s" ,
373
- node .getUri (),
374
- result .getAvailability ()));
375
- model .setAvailability (id , result .getAvailability ());
376
- model .updateHealthCheckCount (id , result .getAvailability ());
377
- } finally {
378
- writeLock .unlock ();
379
- }
381
+ updateNodeAvailability (node .getUri (), id , result .getAvailability ());
380
382
if (checkFailed ) {
381
383
throw new HealthCheckFailedException ("Node " + id , failedCheckException );
382
384
}
383
385
};
384
386
}
385
387
388
+ private void updateNodeAvailability (URI nodeUri , NodeId id , Availability availability ) {
389
+ Lock writeLock = lock .writeLock ();
390
+ writeLock .lock ();
391
+ try {
392
+ LOG .log (
393
+ getDebugLogLevel (),
394
+ String .format ("Health check result for %s was %s" , nodeUri , availability ));
395
+ model .setAvailability (id , availability );
396
+ model .updateHealthCheckCount (id , availability );
397
+ } finally {
398
+ writeLock .unlock ();
399
+ }
400
+ }
401
+
386
402
@ Override
387
403
public boolean drain (NodeId nodeId ) {
388
404
Node node = nodes .get (nodeId );
0 commit comments