KAFKA-15048: Improve handling of unexpected quorum controller errors (#13799)
When the active quorum controller encounters an "unexpected" error, such as a NullPointerException,
it currently resigns its leadership. With this PR, the controller still resigns, but it also
increments the metadata error count metric, which will allow us to better track down these errors.
This PR also fixes a minor bug where performing read operations on a standby controller would
result in an unexpected RuntimeException. The bug happened because the standby controller does not
take in-memory snapshots, and read operations were attempting to read from the epoch of the latest
committed offset. The fix is for the standby controller to simply read the latest value of each
data structure. This is always safe, because standby controllers don't contain uncommitted data.
This PR also fixes a bug where listPartitionReassignments read the latest data rather than the
data as of the last committed offset.
Reviewers: dengziming <dengziming1993@gmail.com>, David Arthur <mumrah@gmail.com>
@ -181,6 +177,7 @@ public final class QuorumController implements Controller {
@@ -181,6 +177,7 @@ public final class QuorumController implements Controller {
staticpublicclassBuilder{
privatefinalintnodeId;
privatefinalStringclusterId;
privateFaultHandlernonFatalFaultHandler=null;
privateFaultHandlerfatalFaultHandler=null;
privateTimetime=Time.SYSTEM;
privateStringthreadNamePrefix=null;
@ -209,6 +206,11 @@ public final class QuorumController implements Controller {
@@ -209,6 +206,11 @@ public final class QuorumController implements Controller {
@ -331,6 +333,8 @@ public final class QuorumController implements Controller {
@@ -331,6 +333,8 @@ public final class QuorumController implements Controller {
thrownewIllegalStateException("You must specify an initial metadata.version using the kafka-storage tool.");
}elseif(quorumFeatures==null){
thrownewIllegalStateException("You must specify the quorum features");
}elseif(nonFatalFaultHandler==null){
thrownewIllegalStateException("You must specify a non-fatal fault handler.");
}elseif(fatalFaultHandler==null){
thrownewIllegalStateException("You must specify a fatal fault handler.");
}
@ -349,6 +353,7 @@ public final class QuorumController implements Controller {
@@ -349,6 +353,7 @@ public final class QuorumController implements Controller {
@ -425,25 +430,23 @@ public final class QuorumController implements Controller {
@@ -425,25 +430,23 @@ public final class QuorumController implements Controller {
// The active controller keeps an in-memory snapshot at the last committed offset,
// which we want to read from when performing read operations. This will avoid
// reading uncommitted data.
returnlastCommittedOffset;
}else{
returnnewNotControllerException("No controller appears to be active.");
// Standby controllers never have uncommitted data in memory. Therefore, we always
// read the latest from every data structure.
returnSnapshotRegistry.LATEST_EPOCH;
}
}
@ -458,41 +461,35 @@ public final class QuorumController implements Controller {
@@ -458,41 +461,35 @@ public final class QuorumController implements Controller {
@ -703,11 +700,11 @@ public final class QuorumController implements Controller {
@@ -703,11 +700,11 @@ public final class QuorumController implements Controller {
@ -1320,7 +1317,8 @@ public final class QuorumController implements Controller {
@@ -1320,7 +1317,8 @@ public final class QuorumController implements Controller {
thrownewRuntimeException("Unable to find last committed offset "+
@ -1350,8 +1348,7 @@ public final class QuorumController implements Controller {
@@ -1350,8 +1348,7 @@ public final class QuorumController implements Controller {
log.error("Cancelling deferred write event {} because the event queue "+
"is now closed.",name);
returnnull;
@ -1590,6 +1587,11 @@ public final class QuorumController implements Controller {
@@ -1590,6 +1587,11 @@ public final class QuorumController implements Controller {
@ -1801,6 +1803,7 @@ public final class QuorumController implements Controller {
@@ -1801,6 +1803,7 @@ public final class QuorumController implements Controller {
privatefinalRecordRedactorrecordRedactor;
privateQuorumController(
FaultHandlernonFatalFaultHandler,
FaultHandlerfatalFaultHandler,
LogContextlogContext,
intnodeId,
@ -1826,6 +1829,7 @@ public final class QuorumController implements Controller {
@@ -1826,6 +1829,7 @@ public final class QuorumController implements Controller {
@ -1964,7 +1968,7 @@ public final class QuorumController implements Controller {
@@ -1964,7 +1968,7 @@ public final class QuorumController implements Controller {
@ -1972,7 +1976,7 @@ public final class QuorumController implements Controller {
@@ -1972,7 +1976,7 @@ public final class QuorumController implements Controller {
@ -1983,7 +1987,7 @@ public final class QuorumController implements Controller {
@@ -1983,7 +1987,7 @@ public final class QuorumController implements Controller {
@ -2003,7 +2007,7 @@ public final class QuorumController implements Controller {
@@ -2003,7 +2007,7 @@ public final class QuorumController implements Controller {
@ -2024,13 +2028,8 @@ public final class QuorumController implements Controller {
@@ -2024,13 +2028,8 @@ public final class QuorumController implements Controller {
@ -2075,7 +2074,7 @@ public final class QuorumController implements Controller {
@@ -2075,7 +2074,7 @@ public final class QuorumController implements Controller {
@ -43,6 +43,7 @@ public class QuorumControllerTestEnv implements AutoCloseable {
@@ -43,6 +43,7 @@ public class QuorumControllerTestEnv implements AutoCloseable {
@ -111,6 +112,9 @@ public class QuorumControllerTestEnv implements AutoCloseable {
@@ -111,6 +112,9 @@ public class QuorumControllerTestEnv implements AutoCloseable {
@ -165,5 +169,8 @@ public class QuorumControllerTestEnv implements AutoCloseable {
@@ -165,5 +169,8 @@ public class QuorumControllerTestEnv implements AutoCloseable {