KAFKA-7672: force write checkpoint during StreamTask #suspend (#6115)
This fix addresses the second issue pointed out in https://issues.apache.org/jira/browse/KAFKA-7672
In the current setup, we skip the offset checkpoint file write during #suspend when EOS is turned on, which introduces a potential race condition during the StateManager #closeSuspended call. To mitigate the problem, we now always write the checkpoint file in #suspend.
Reviewers: Guozhang Wang <wangguoz@gmail.com>, Matthias J. Sax <mjsax@apache.org>, John Roesler <john@confluent.io>, Bill Bejeck <bbejeck@gmail.com>
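For context, a minimal, self-contained sketch of the behavioral change this patch makes around StreamTask #suspend. It uses hypothetical stand-in types (CheckpointableStateManager, SuspendSketch) rather than the real Streams internals, and it assumes the pre-fix guard that skipped the checkpoint write under EOS, as described above; the ProcessorStateManager #close hunk further below additionally deletes the checkpoint file on an unclean close when EOS is enabled.

import java.util.Map;

// Hypothetical stand-in for the small part of the state manager used here.
interface CheckpointableStateManager {
    void flush();
    void checkpoint(Map<String, Long> ackedOffsets);
}

final class SuspendSketch {
    private final CheckpointableStateManager stateMgr;
    private final boolean eosEnabled;

    SuspendSketch(final CheckpointableStateManager stateMgr, final boolean eosEnabled) {
        this.stateMgr = stateMgr;
        this.eosEnabled = eosEnabled;
    }

    // Pre-fix shape (assumed): the checkpoint file is only written when EOS is
    // disabled, so the EOS path leaves no checkpoint behind at suspension time.
    void suspendBefore(final Map<String, Long> ackedOffsets) {
        stateMgr.flush();
        if (!eosEnabled) {
            stateMgr.checkpoint(ackedOffsets);
        }
    }

    // Post-fix shape: always write the checkpoint on suspend; an unclean close
    // under EOS deletes the file again (see the ProcessorStateManager hunks below).
    void suspendAfter(final Map<String, Long> ackedOffsets) {
        stateMgr.flush();
        stateMgr.checkpoint(ackedOffsets);
    }
}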
@@ -105,7 +105,7 @@ public class GlobalStateUpdateTask implements GlobalStateMaintainer {
@@ -89,6 +89,9 @@ public class ProcessorStateManager extends AbstractStateManager {
// load the checkpoint information
checkpointableOffsets.putAll(checkpoint.read());
log.trace("Checkpointable offsets read from checkpoint: {}", checkpointableOffsets);
if (eosEnabled) {
// delete the checkpoint file after finish loading its stored offsets
checkpoint.delete();
@@ -140,7 +143,7 @@ public class ProcessorStateManager extends AbstractStateManager {
restoreCallbacks.put(topic, stateRestoreCallback);
recordConverters.put(topic, recordConverter);
} else {
log.trace("Restoring state store {} from changelog topic {}", storeName, topic);
log.trace("Restoring state store {} from changelog topic {} at checkpoint {}", storeName, topic, checkpointableOffsets.get(storePartition));
final StateRestorer restorer = new StateRestorer(
storePartition,
@@ -254,7 +257,7 @@ public class ProcessorStateManager extends AbstractStateManager {
// attempting to close the stores, just in case they
// are not closed by a ProcessorNode yet
@@ -271,11 +274,17 @@ public class ProcessorStateManager extends AbstractStateManager {
log.error("Failed to close state store {}: ",store.name(),e);
}
}
stores.clear();
}
if(ackedOffsets!=null){
checkpoint(ackedOffsets);
if(!clean&&eosEnabled&&checkpoint!=null){
// delete the checkpoint file if this is an unclean close
try{
checkpoint.delete();
checkpoint=null;
}catch(finalIOExceptione){
thrownewProcessorStateException(String.format("%sError while deleting the checkpoint file",logPrefix),e);
}
stores.clear();
}
if(firstException!=null){
@@ -287,6 +296,7 @@ public class ProcessorStateManager extends AbstractStateManager {
log.trace("Checkpointable offsets updated with restored offsets: {}",this.checkpointableOffsets);
for(finalStateStorestore:stores.values()){
finalStringstoreName=store.name();
// only checkpoint the offset to the offsets file if
@@ -302,6 +312,9 @@ public class ProcessorStateManager extends AbstractStateManager {
}
}
}
log.trace("Checkpointable offsets updated with active acked offsets: {}",this.checkpointableOffsets);
@@ -195,6 +195,8 @@ public class StoreChangelogReader implements ChangelogReader {
@@ -202,6 +204,8 @@ public class StoreChangelogReader implements ChangelogReader {
@@ -66,7 +66,7 @@ public class GlobalStateManagerStub implements GlobalStateManager {