Commit fb69a4f

[grid] Querying Node status only once at registration
We were querying the Node twice during its registration process; this change saves a network call, which can matter when many Nodes are being registered at once.
1 parent d389fc0 · commit fb69a4f
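To make the saving concrete, here is a small, self-contained sketch of the change in call pattern. NodeStub and its call counter are illustrative stand-ins, not Selenium's Node API; the point is simply that the new flow issues one getStatus() call instead of two.

import java.util.concurrent.atomic.AtomicInteger;

public class StatusOnceDemo {

  // Stand-in for a Node whose getStatus() goes over the network.
  static class NodeStub {
    final AtomicInteger statusCalls = new AtomicInteger();

    String getStatus() {
      statusCalls.incrementAndGet();
      return "UP";
    }
  }

  public static void main(String[] args) {
    NodeStub node = new NodeStub();

    // Old flow: one query to seed the model, a second one inside the initial health check.
    System.out.println("model sees: " + node.getStatus());
    System.out.println("health check sees: " + node.getStatus());
    System.out.println("old flow network calls: " + node.statusCalls.getAndSet(0)); // 2

    // New flow: query once and reuse the result for both steps.
    String initialStatus = node.getStatus();
    System.out.println("model sees: " + initialStatus);
    System.out.println("availability update sees: " + initialStatus);
    System.out.println("new flow network calls: " + node.statusCalls.get()); // 1
  }
}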

File tree

1 file changed (+46 -30 lines)

java/src/org/openqa/selenium/grid/distributor/local/LocalDistributor.java

Lines changed: 46 additions & 30 deletions
@@ -20,6 +20,7 @@
 import static com.google.common.collect.ImmutableSet.toImmutableSet;
 import static org.openqa.selenium.grid.data.Availability.DOWN;
 import static org.openqa.selenium.grid.data.Availability.DRAINING;
+import static org.openqa.selenium.grid.data.Availability.UP;
 import static org.openqa.selenium.internal.Debug.getDebugLogLevel;
 import static org.openqa.selenium.remote.RemoteTags.CAPABILITIES;
 import static org.openqa.selenium.remote.RemoteTags.CAPABILITIES_EVENT;
@@ -44,6 +45,7 @@
 import org.openqa.selenium.concurrent.GuardedRunnable;
 import org.openqa.selenium.events.EventBus;
 import org.openqa.selenium.grid.config.Config;
+import org.openqa.selenium.grid.data.Availability;
 import org.openqa.selenium.grid.data.CreateSessionRequest;
 import org.openqa.selenium.grid.data.CreateSessionResponse;
 import org.openqa.selenium.grid.data.DistributorStatus;
@@ -91,6 +93,7 @@

 import java.io.Closeable;
 import java.io.UncheckedIOException;
+import java.net.URI;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -297,8 +300,10 @@ public LocalDistributor add(Node node) {

     // An exception occurs if Node heartbeat has started but the server is not ready.
     // Unhandled exception blocks the event-bus thread from processing any event henceforth.
+    NodeStatus initialNodeStatus;
     try {
-      model.add(node.getStatus());
+      initialNodeStatus = node.getStatus();
+      model.add(initialNodeStatus);
       nodes.put(node.getId(), node);
     } catch (Exception e) {
       LOG.log(
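The try/catch kept around the registration in the hunk above exists because, per the comment, this code runs on the event-bus listener thread and an unhandled exception would stop later events from being processed. A rough, hypothetical sketch of that guard pattern, using a plain single-threaded executor rather than Selenium's EventBus:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

public class GuardedListenerSketch {

  private static final Logger LOG = Logger.getLogger(GuardedListenerSketch.class.getName());

  public static void main(String[] args) throws InterruptedException {
    // Stand-in for the single thread that consumes node-added events; not Selenium's EventBus.
    ExecutorService eventThread = Executors.newSingleThreadExecutor();

    Runnable registration = () -> {
      try {
        // Stand-in for node.getStatus() failing because the Node's server is not ready yet.
        throw new RuntimeException("heartbeat started but server not ready");
      } catch (Exception e) {
        // Catch and log so the failure never escapes the listener.
        LOG.log(Level.WARNING, "Unable to register node, skipping it for now", e);
      }
    };

    eventThread.submit(registration);
    eventThread.submit(() -> LOG.info("later events keep flowing"));

    eventThread.shutdown();
    eventThread.awaitTermination(5, TimeUnit.SECONDS);
  }
}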
@@ -309,21 +314,10 @@ public LocalDistributor add(Node node) {
     }

     // Extract the health check
-    Runnable runnableHealthCheck = asRunnableHealthCheck(node);
-    allChecks.put(node.getId(), runnableHealthCheck);
-
-    // Running the health check right after the Node registers itself. We retry the
-    // execution because the Node might be on a complex network topology. For example,
-    // Kubernetes pods with IPs that take a while before they are reachable.
-    RetryPolicy<Object> initialHealthCheckPolicy = new RetryPolicy<>()
-      .withMaxAttempts(-1)
-      .withMaxDuration(Duration.ofSeconds(90))
-      .withDelay(Duration.ofSeconds(15))
-      .abortIf(result -> true);
-
-    LOG.log(getDebugLogLevel(), "Running initial health check for Node " + node.getUri());
-    Executors.newSingleThreadExecutor().submit(
-        () -> Failsafe.with(initialHealthCheckPolicy).run(runnableHealthCheck::run));
+    Runnable healthCheck = asRunnableHealthCheck(node);
+    allChecks.put(node.getId(), healthCheck);
+
+    updateNodeStatus(initialNodeStatus, healthCheck);

     LOG.info(String.format(
       "Added node %s at %s. Health check every %ss",
@@ -336,6 +330,27 @@ public LocalDistributor add(Node node) {
     return this;
   }

+  private void updateNodeStatus(NodeStatus status, Runnable healthCheck) {
+    // Setting the Node as available if the initial call to status was successful.
+    // Otherwise, retry to have it available as soon as possible.
+    if (status.getAvailability() == UP) {
+      updateNodeAvailability(status.getExternalUri(), status.getNodeId(), status.getAvailability());
+    } else {
+      // Running the health check right after the Node registers itself. We retry the
+      // execution because the Node might be on a complex network topology. For example,
+      // Kubernetes pods with IPs that take a while before they are reachable.
+      RetryPolicy<Object> initialHealthCheckPolicy = new RetryPolicy<>()
+        .withMaxAttempts(-1)
+        .withMaxDuration(Duration.ofSeconds(90))
+        .withDelay(Duration.ofSeconds(15))
+        .abortIf(result -> true);
+
+      LOG.log(getDebugLogLevel(), "Running health check for Node " + status.getExternalUri());
+      Executors.newSingleThreadExecutor().submit(
+          () -> Failsafe.with(initialHealthCheckPolicy).run(healthCheck::run));
+    }
+  }
+
   private Runnable runNodeHealthChecks() {
     return () -> {
       ImmutableMap<NodeId, Runnable> nodeHealthChecks = ImmutableMap.copyOf(allChecks);
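The retry policy moved into updateNodeStatus above reads as: no cap on attempts, bounded to a 90-second window, 15 seconds between attempts, and abortIf(result -> true) so retrying stops after the first attempt that completes without throwing. A minimal standalone sketch of that pattern, assuming Failsafe 2.x (net.jodah.failsafe) and a placeholder Runnable in place of the real Node health check:

import java.time.Duration;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import net.jodah.failsafe.Failsafe;
import net.jodah.failsafe.RetryPolicy;

public class InitialHealthCheckSketch {

  public static void main(String[] args) {
    // Placeholder for the real health check Runnable, which pings the Node over the network.
    Runnable healthCheck = () -> System.out.println("checking node reachability...");

    RetryPolicy<Object> policy = new RetryPolicy<>()
        .withMaxAttempts(-1)                      // no cap on the number of attempts...
        .withMaxDuration(Duration.ofSeconds(90))  // ...but stop trying after 90 seconds overall
        .withDelay(Duration.ofSeconds(15))        // wait 15 seconds between attempts
        .abortIf(result -> true);                 // a completed (non-throwing) attempt ends the retries

    // Run off the registering thread so a slow or unreachable Node does not block registration.
    ExecutorService initialCheckPool = Executors.newSingleThreadExecutor();
    initialCheckPool.submit(() -> Failsafe.with(policy).run(healthCheck::run));
    initialCheckPool.shutdown();  // let the submitted check finish, then release the thread
  }
}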
@@ -363,26 +378,27 @@ private Runnable asRunnableHealthCheck(Node node) {
         failedCheckException = e;
       }

-      Lock writeLock = lock.writeLock();
-      writeLock.lock();
-      try {
-        LOG.log(
-          getDebugLogLevel(),
-          String.format(
-            "Health check result for %s was %s",
-            node.getUri(),
-            result.getAvailability()));
-        model.setAvailability(id, result.getAvailability());
-        model.updateHealthCheckCount(id, result.getAvailability());
-      } finally {
-        writeLock.unlock();
-      }
+      updateNodeAvailability(node.getUri(), id, result.getAvailability());
       if (checkFailed) {
         throw new HealthCheckFailedException("Node " + id, failedCheckException);
       }
     };
   }

+  private void updateNodeAvailability(URI nodeUri, NodeId id, Availability availability) {
+    Lock writeLock = lock.writeLock();
+    writeLock.lock();
+    try {
+      LOG.log(
+        getDebugLogLevel(),
+        String.format("Health check result for %s was %s", nodeUri, availability));
+      model.setAvailability(id, availability);
+      model.updateHealthCheckCount(id, availability);
+    } finally {
+      writeLock.unlock();
+    }
+  }
+
   @Override
   public boolean drain(NodeId nodeId) {
     Node node = nodes.get(nodeId);
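The extracted updateNodeAvailability keeps the same locking discipline the inline code had: take the write lock, mutate the model, and always release in a finally block. A generic sketch of that pattern; GuardedModel and its map are illustrative stand-ins, not Selenium's GridModel:

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

public class GuardedModel {

  private final ReadWriteLock lock = new ReentrantReadWriteLock();
  private final Map<String, String> availabilityByNode = new HashMap<>();

  public void setAvailability(String nodeId, String availability) {
    Lock writeLock = lock.writeLock();
    writeLock.lock();
    try {
      // Only one writer at a time; readers holding lock.readLock() are excluded while we mutate.
      availabilityByNode.put(nodeId, availability);
    } finally {
      // Always release, even if the update throws, so the model never stays locked.
      writeLock.unlock();
    }
  }

  public String getAvailability(String nodeId) {
    Lock readLock = lock.readLock();
    readLock.lock();
    try {
      return availabilityByNode.get(nodeId);
    } finally {
      readLock.unlock();
    }
  }
}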
