From 96b3c059ac0af8249ed3770728a766af295551b6 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Tue, 9 Jun 2026 15:46:20 +0800 Subject: [PATCH 1/6] Fix schema region visibility race and remove datanode message --- .../IoTDBRemoveDataNodeNormalIT.java | 32 +++++++++ .../confignode/i18n/ProcedureMessages.java | 4 ++ .../confignode/i18n/ProcedureMessages.java | 4 ++ .../manager/partition/PartitionManager.java | 68 ++++++++++++++++++- .../procedure/env/ConfigNodeProcedureEnv.java | 7 +- .../procedure/env/RemoveDataNodeHandler.java | 58 ++++++++++++---- .../region/CreateRegionGroupsProcedure.java | 10 ++- 7 files changed, 165 insertions(+), 18 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java index 4ce111490701a..304424198b6d0 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java @@ -135,6 +135,38 @@ public void tearDown() throws InterruptedException { // ConsensusFactory.IOT_CONSENSUS_V2); // } + @Test + public void failWhenDataReplicationFactorIsOneUseSQL() throws Exception { + EnvFactory.getEnv() + .getConfig() + .getCommonConfig() + .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS) + .setSchemaReplicationFactor(3) + .setDataReplicationFactor(1) + .setDefaultDataRegionGroupNumPerDatabase(1); + EnvFactory.getEnv().initClusterEnvironment(1, 3); + + try (final Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + final Statement statement = makeItCloseQuietly(connection.createStatement()); + final ResultSet resultSet = statement.executeQuery(SHOW_DATANODES)) { + final Set allDataNodeId = new HashSet<>(); + while (resultSet.next()) { 
+ allDataNodeId.add(resultSet.getInt(ColumnHeaderConstant.NODE_ID)); + } + + final String removeDataNodeSQL = + generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1)); + try { + statement.execute(removeDataNodeSQL); + Assert.fail("Remove DataNode should fail when data_replication_factor is 1"); + } catch (final IoTDBSQLException e) { + Assert.assertTrue(e.getMessage(), e.getMessage().contains("data_replication_factor is 1")); + Assert.assertFalse( + e.getMessage(), e.getMessage().contains("Failed to remove all requested data nodes")); + } + } + } + @Test public void fail1C3DTestIoTUseSQL() throws Exception { // Setup 1C3D with schema replication factor = 3, and remove 1D, this test should fail due to diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 085a803777ee8..ddce0ddd9573e 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -445,6 +445,10 @@ public final class ProcedureMessages { "Failed to push topic meta to dataNodes, details: %s"; public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN = "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; + + public static final String + FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = + "Cannot remove DataNode because data_replication_factor is 1 or at least one DataRegion has only one replica. Removing a DataNode may cause data loss. 
Increase data_replication_factor and ensure each DataRegion has more than one replica before removing DataNodes."; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index d928f4f6bae8b..9b21d61cc7420 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -445,6 +445,10 @@ public final class ProcedureMessages { "Failed to push topic meta to dataNodes, details: %s"; public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN = "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; + + public static final String + FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = + "不能移除 DataNode,因为 data_replication_factor 为 1,或至少存在一个 DataRegion 只有一个副本。移除 DataNode 可能造成数据丢失。请先提高 data_replication_factor,并确保每个 DataRegion 都有多个副本,再移除 DataNode。"; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java index 5be81256b5c39..e4373e773a810 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java @@ -147,6 +147,9 @@ public class PartitionManager { public static final String CONSENSUS_WRITE_ERROR = "Failed in the write API executing the consensus layer due to: "; + private static final long REGION_GROUP_VISIBILITY_TIMEOUT_MS = 10_000L; + private static final long REGION_GROUP_VISIBILITY_CHECK_INTERVAL_MS = 20L; + // Monitor for leadership change private final Object scheduleMonitor = new Object(); @@ -715,12 +718,75 @@ private TSStatus generateAndAllocateRegionGroups( getLoadManager().allocateRegionGroups(allotmentMap, consensusGroupType); LOGGER.info(ManagerMessages.CREATEREGIONGROUPS_STARTING_TO_CREATE_THE_FOLLOWING_REGIONGROUPS); createRegionGroupsPlan.planLog(LOGGER); - return getProcedureManager().createRegionGroups(consensusGroupType, createRegionGroupsPlan); + final TSStatus createStatus = + getProcedureManager().createRegionGroups(consensusGroupType, createRegionGroupsPlan); + if (createStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return createStatus; + } + return waitForRegionGroupsVisible(createRegionGroupsPlan, consensusGroupType); } else { return RpcUtils.SUCCESS_STATUS; } } + private TSStatus waitForRegionGroupsVisible( + final CreateRegionGroupsPlan createRegionGroupsPlan, + final TConsensusGroupType consensusGroupType) { + final Map> expectedRegionGroups = new HashMap<>(); + createRegionGroupsPlan + .getRegionGroupMap() + .forEach( + (database, regionReplicaSets) -> { + final Set regionGroupIds = + regionReplicaSets.stream() + .map(TRegionReplicaSet::getRegionId) + .filter(regionGroupId -> consensusGroupType.equals(regionGroupId.getType())) + .collect(Collectors.toSet()); + if (!regionGroupIds.isEmpty()) { + expectedRegionGroups.put(database, regionGroupIds); + } + }); + + final long startTime = 
System.currentTimeMillis(); + while (System.currentTimeMillis() - startTime <= REGION_GROUP_VISIBILITY_TIMEOUT_MS) { + if (areRegionGroupsVisible(expectedRegionGroups, consensusGroupType)) { + return RpcUtils.SUCCESS_STATUS; + } + try { + TimeUnit.MILLISECONDS.sleep(REGION_GROUP_VISIBILITY_CHECK_INTERVAL_MS); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode()) + .setMessage( + String.format( + "Interrupted while waiting for created %s RegionGroups %s to become visible in PartitionInfo.", + consensusGroupType, expectedRegionGroups)); + } + } + + final String message = + String.format( + "Created %s RegionGroups %s are not visible in PartitionInfo within %d ms.", + consensusGroupType, expectedRegionGroups, REGION_GROUP_VISIBILITY_TIMEOUT_MS); + LOGGER.warn(message); + return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode()).setMessage(message); + } + + private boolean areRegionGroupsVisible( + final Map> expectedRegionGroups, + final TConsensusGroupType consensusGroupType) { + for (final Map.Entry> entry : expectedRegionGroups.entrySet()) { + final Set visibleRegionGroups = + partitionInfo.getRegionGroupSlotsCounter(entry.getKey(), consensusGroupType).stream() + .map(Pair::getRight) + .collect(Collectors.toSet()); + if (!visibleRegionGroups.containsAll(entry.getValue())) { + return false; + } + } + return true; + } + /** * Only leader use this interface. Checks whether the specified DataPartition has a successor and * returns if it does. 
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index d271d5ef33b9c..4105b8ecf5c2c 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -511,12 +511,15 @@ public List notifyRegionMigrationToAllDataNodes( return clientHandler.getResponseList(); } - public void persistRegionGroup(CreateRegionGroupsPlan createRegionGroupsPlan) { + public TSStatus persistRegionGroup(CreateRegionGroupsPlan createRegionGroupsPlan) { // Persist the allocation result try { - getConsensusManager().write(createRegionGroupsPlan); + return getConsensusManager().write(createRegionGroupsPlan); } catch (ConsensusException e) { LOG.warn("Failed in the write API executing the consensus layer due to: ", e); + return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode()) + .setMessage( + "Failed to persist RegionGroup allocation in the consensus layer: " + e.getMessage()); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index 5b505ec001bff..a7235747f71ac 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -559,6 +559,14 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { TSStatus status = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); List removedDataNodes = removeDataNodePlan.getDataNodeLocations(); + if (hasSingleDataRegionReplica()) { + 
status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); + status.setMessage( + ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE); + LOGGER.error(status.getMessage()); + return status; + } + int availableDatanodeSize = configManager .getNodeManager() @@ -566,20 +574,26 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { .size(); // when the configuration is one replication, it will be failed if the data node is not in // running state. - if (CONF.getSchemaReplicationFactor() == 1 || CONF.getDataReplicationFactor() == 1) { - for (TDataNodeLocation dataNodeLocation : removedDataNodes) { - // check whether removed data node is in running state - if (!NodeStatus.Running.equals( - configManager.getLoadManager().getNodeStatus(dataNodeLocation.getDataNodeId()))) { - removedDataNodes.remove(dataNodeLocation); - LOGGER.error( - ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN, dataNodeLocation); - } - if (removedDataNodes.isEmpty()) { - status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); - status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES); - return status; - } + if (CONF.getSchemaReplicationFactor() == 1) { + final List notRunningDataNodes = + removedDataNodes.stream() + .filter( + dataNodeLocation -> + !NodeStatus.Running.equals( + configManager + .getLoadManager() + .getNodeStatus(dataNodeLocation.getDataNodeId()))) + .collect(Collectors.toList()); + notRunningDataNodes.forEach( + dataNodeLocation -> + LOGGER.error( + ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN, + dataNodeLocation)); + removedDataNodes.removeAll(notRunningDataNodes); + if (removedDataNodes.isEmpty()) { + status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); + status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES); + return status; } } @@ -604,6 +618,22 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan 
removeDataNodePlan) { return status; } + private boolean hasSingleDataRegionReplica() { + return CONF.getDataReplicationFactor() == 1 + || configManager + .getClusterSchemaManager() + .getMatchedDatabaseSchemasByName( + configManager.getClusterSchemaManager().getDatabaseNames(null), null) + .values() + .stream() + .anyMatch(databaseSchema -> databaseSchema.getDataReplicationFactor() == 1) + || configManager + .getPartitionManager() + .getAllReplicaSets(TConsensusGroupType.DataRegion) + .stream() + .anyMatch(replicaSet -> replicaSet.getDataNodeLocationsSize() <= 1); + } + /** * Checks whether all DataNodes specified for deletion exist in the cluster. * diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java index 2cb283d400eca..e9cce807e77fc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java @@ -23,7 +23,9 @@ import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.cluster.RegionStatus; +import org.apache.iotdb.commons.exception.IoTDBException; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.commons.utils.ThriftCommonsSerDeUtils; import org.apache.iotdb.confignode.conf.ConfigNodeConfig; @@ -36,10 +38,12 @@ import org.apache.iotdb.confignode.persistence.partition.maintainer.RegionCreateTask; import org.apache.iotdb.confignode.persistence.partition.maintainer.RegionDeleteTask; import 
org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.state.CreateRegionGroupsState; import org.apache.iotdb.confignode.procedure.store.ProcedureType; import org.apache.iotdb.consensus.exception.ConsensusException; +import org.apache.iotdb.rpc.TSStatusCode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -173,7 +177,11 @@ protected Flow executeFromState( } })); - env.persistRegionGroup(persistPlan); + final TSStatus persistStatus = env.persistRegionGroup(persistPlan); + if (persistStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + setFailure(new ProcedureException(new IoTDBException(persistStatus))); + return Flow.NO_MORE_STATE; + } try { env.getConfigManager().getConsensusManager().write(offerPlan); } catch (final ConsensusException e) { From 9c46d01ff02f92b03890bd197f3ae69d879611b9 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Tue, 9 Jun 2026 18:28:20 +0800 Subject: [PATCH 2/6] Replace RegionGroup visibility polling with failure-path diagnostic; clarify RemoveDataNode message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the 10s busy-wait waitForRegionGroupsVisible loop in PartitionManager: the schema-region create/activate/allocate path is unchanged from the PR baseline, so the poll only masks the intermittent "no available RegionGroup" race probabilistically rather than fixing it. Instead, log a single WARN on the failure path (right before throwing NoAvailableRegionGroupException) that dumps every RegionGroup visible in PartitionInfo for the Database and its LoadCache status. 
This pinpoints, on the next CI repro, whether PartitionInfo has no RegionGroup yet or has some that are all Disabled — without flooding the log, since it only fires when allocation already failed. Also reword FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE (en + zh): drop the misleading data-loss / increase-factor tail and end with "Removing DataNodes is not supported with single replica." --- .../confignode/i18n/ProcedureMessages.java | 2 +- .../confignode/i18n/ProcedureMessages.java | 2 +- .../manager/partition/PartitionManager.java | 86 ++++--------------- 3 files changed, 21 insertions(+), 69 deletions(-) diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index ddce0ddd9573e..83b8d8df2df2e 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -448,7 +448,7 @@ public final class ProcedureMessages { public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = - "Cannot remove DataNode because data_replication_factor is 1 or at least one DataRegion has only one replica. Removing a DataNode may cause data loss. Increase data_replication_factor and ensure each DataRegion has more than one replica before removing DataNodes."; + "Cannot remove DataNode because data_replication_factor is 1 or at least one DataRegion has only one replica. 
Removing DataNodes is not supported with single replica."; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 9b21d61cc7420..c4097ea97259a 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -448,7 +448,7 @@ public final class ProcedureMessages { public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = - "不能移除 DataNode,因为 data_replication_factor 为 1,或至少存在一个 DataRegion 只有一个副本。移除 DataNode 可能造成数据丢失。请先提高 data_replication_factor,并确保每个 DataRegion 都有多个副本,再移除 DataNode。"; + "不能移除 DataNode,因为 data_replication_factor 为 1,或至少存在一个 DataRegion 只有一个副本。单副本不支持移除 DataNode。"; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java index e4373e773a810..dfa3448bc0f3b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java @@ -108,6 +108,7 @@ import java.util.Collections; import 
java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -147,9 +148,6 @@ public class PartitionManager { public static final String CONSENSUS_WRITE_ERROR = "Failed in the write API executing the consensus layer due to: "; - private static final long REGION_GROUP_VISIBILITY_TIMEOUT_MS = 10_000L; - private static final long REGION_GROUP_VISIBILITY_CHECK_INTERVAL_MS = 20L; - // Monitor for leadership change private final Object scheduleMonitor = new Object(); @@ -718,75 +716,12 @@ private TSStatus generateAndAllocateRegionGroups( getLoadManager().allocateRegionGroups(allotmentMap, consensusGroupType); LOGGER.info(ManagerMessages.CREATEREGIONGROUPS_STARTING_TO_CREATE_THE_FOLLOWING_REGIONGROUPS); createRegionGroupsPlan.planLog(LOGGER); - final TSStatus createStatus = - getProcedureManager().createRegionGroups(consensusGroupType, createRegionGroupsPlan); - if (createStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - return createStatus; - } - return waitForRegionGroupsVisible(createRegionGroupsPlan, consensusGroupType); + return getProcedureManager().createRegionGroups(consensusGroupType, createRegionGroupsPlan); } else { return RpcUtils.SUCCESS_STATUS; } } - private TSStatus waitForRegionGroupsVisible( - final CreateRegionGroupsPlan createRegionGroupsPlan, - final TConsensusGroupType consensusGroupType) { - final Map> expectedRegionGroups = new HashMap<>(); - createRegionGroupsPlan - .getRegionGroupMap() - .forEach( - (database, regionReplicaSets) -> { - final Set regionGroupIds = - regionReplicaSets.stream() - .map(TRegionReplicaSet::getRegionId) - .filter(regionGroupId -> consensusGroupType.equals(regionGroupId.getType())) - .collect(Collectors.toSet()); - if (!regionGroupIds.isEmpty()) { - expectedRegionGroups.put(database, regionGroupIds); - } - }); - - final long startTime = System.currentTimeMillis(); - while (System.currentTimeMillis() - startTime 
<= REGION_GROUP_VISIBILITY_TIMEOUT_MS) { - if (areRegionGroupsVisible(expectedRegionGroups, consensusGroupType)) { - return RpcUtils.SUCCESS_STATUS; - } - try { - TimeUnit.MILLISECONDS.sleep(REGION_GROUP_VISIBILITY_CHECK_INTERVAL_MS); - } catch (final InterruptedException e) { - Thread.currentThread().interrupt(); - return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode()) - .setMessage( - String.format( - "Interrupted while waiting for created %s RegionGroups %s to become visible in PartitionInfo.", - consensusGroupType, expectedRegionGroups)); - } - } - - final String message = - String.format( - "Created %s RegionGroups %s are not visible in PartitionInfo within %d ms.", - consensusGroupType, expectedRegionGroups, REGION_GROUP_VISIBILITY_TIMEOUT_MS); - LOGGER.warn(message); - return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode()).setMessage(message); - } - - private boolean areRegionGroupsVisible( - final Map> expectedRegionGroups, - final TConsensusGroupType consensusGroupType) { - for (final Map.Entry> entry : expectedRegionGroups.entrySet()) { - final Set visibleRegionGroups = - partitionInfo.getRegionGroupSlotsCounter(entry.getKey(), consensusGroupType).stream() - .map(Pair::getRight) - .collect(Collectors.toSet()); - if (!visibleRegionGroups.containsAll(entry.getValue())) { - return false; - } - } - return true; - } - /** * Only leader use this interface. Checks whether the specified DataPartition has a successor and * returns if it does. @@ -1059,6 +994,23 @@ public List> getSortedRegionGroupSlotsCounter( } if (result.isEmpty()) { + // Diagnostic for the intermittent "no available RegionGroup" CI failures: dump every + // RegionGroup visible in PartitionInfo for this Database together with its LoadCache status. + // This pinpoints whether PartitionInfo simply has no RegionGroup yet (newly created + // RegionGroup not exposed) or it has some but all of them are currently Disabled. 
+ // Only logged on the failure path right before throwing, so it never floods the log. + final Map visibleRegionGroupStatusMap = + new LinkedHashMap<>(); + regionGroupSlotsCounter.forEach( + slotsCounter -> + visibleRegionGroupStatusMap.put( + slotsCounter.getRight(), + getLoadManager().getRegionGroupStatus(slotsCounter.getRight()))); + LOGGER.warn( + "No available {} RegionGroup for Database: {}. RegionGroups visible in PartitionInfo and their LoadCache status: {}", + type, + database, + visibleRegionGroupStatusMap); throw new NoAvailableRegionGroupException(type, Collections.singletonList(database)); } From d7a7291be81e00850677e1e03d50ccacdc14b4dd Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Wed, 10 Jun 2026 13:53:55 +0800 Subject: [PATCH 3/6] Allow removing DataNode under single replica unless it is the last one The previous guard rejected removing any DataNode whenever the cluster kept a single replica (data_replication_factor == 1), which broke IoTDBClusterNodeGetterIT.queryAndRemoveDataNodeTest (2C2D, single replica, removing 1 of 2 DataNodes is legal and must return SUCCESS). Drop the blanket hasSingleDataRegionReplica() guard and reuse the existing capacity check: removal is rejected only when it would leave fewer than NodeInfo.getMinimumDataNode() DataNodes. Under a true single replica (MINIMUM_DATANODE == max(schema, data) == 1) that means only the last remaining DataNode cannot be removed, with a dedicated message. Updated the failing-path IT accordingly. 
--- .../IoTDBRemoveDataNodeNormalIT.java | 17 ++++--- .../confignode/i18n/ProcedureMessages.java | 4 +- .../confignode/i18n/ProcedureMessages.java | 4 +- .../procedure/env/RemoveDataNodeHandler.java | 48 +++++++------------ 4 files changed, 31 insertions(+), 42 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java index 304424198b6d0..7f40ac3d27c88 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java @@ -136,15 +136,19 @@ public void tearDown() throws InterruptedException { // } @Test - public void failWhenDataReplicationFactorIsOneUseSQL() throws Exception { + public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { + // With a single replica (schema_replication_factor and data_replication_factor are both 1), + // removing DataNodes is still supported as long as more than one DataNode remains, but the last + // remaining DataNode cannot be removed because there is nowhere to migrate its regions to. + // Here we set up 1C2D with single replica and try to remove both DataNodes, which must fail. 
EnvFactory.getEnv() .getConfig() .getCommonConfig() .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS) - .setSchemaReplicationFactor(3) + .setSchemaReplicationFactor(1) .setDataReplicationFactor(1) .setDefaultDataRegionGroupNumPerDatabase(1); - EnvFactory.getEnv().initClusterEnvironment(1, 3); + EnvFactory.getEnv().initClusterEnvironment(1, 2); try (final Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); final Statement statement = makeItCloseQuietly(connection.createStatement()); @@ -155,12 +159,13 @@ public void failWhenDataReplicationFactorIsOneUseSQL() throws Exception { } final String removeDataNodeSQL = - generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1)); + generateRemoveString(selectRemoveDataNodes(allDataNodeId, allDataNodeId.size())); try { statement.execute(removeDataNodeSQL); - Assert.fail("Remove DataNode should fail when data_replication_factor is 1"); + Assert.fail( + "Remove DataNode should fail when it would leave no DataNode under single replica"); } catch (final IoTDBSQLException e) { - Assert.assertTrue(e.getMessage(), e.getMessage().contains("data_replication_factor is 1")); + Assert.assertTrue(e.getMessage(), e.getMessage().contains("single replica")); Assert.assertFalse( e.getMessage(), e.getMessage().contains("Failed to remove all requested data nodes")); } diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 83b8d8df2df2e..8a61c0aa8b6ab 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -447,8 +447,8 @@ public final class ProcedureMessages { "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; public static 
final String - FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = - "Cannot remove DataNode because data_replication_factor is 1 or at least one DataRegion has only one replica. Removing DataNodes is not supported with single replica."; + FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA = + "Cannot remove the last DataNode when the cluster keeps a single replica (schema_replication_factor and data_replication_factor are both 1). With a single replica there is nowhere to migrate the regions to, so removing DataNodes is only supported when more than one DataNode remains."; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index c4097ea97259a..33cf695f0aaed 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -447,8 +447,8 @@ public final class ProcedureMessages { "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; public static final String - FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE = - "不能移除 DataNode,因为 data_replication_factor 为 1,或至少存在一个 DataRegion 只有一个副本。单副本不支持移除 DataNode。"; + FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA = + "集群为单副本(schema_replication_factor 与 data_replication_factor 均为 1)时,不能移除最后一个 DataNode:单副本下没有其它节点可供迁移 Region,因此仅当移除后仍保留多于一个 DataNode 时才支持移除 DataNode。"; public static final String 
FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index a7235747f71ac..4a6f9edf68fef 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -559,14 +559,6 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { TSStatus status = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); List removedDataNodes = removeDataNodePlan.getDataNodeLocations(); - if (hasSingleDataRegionReplica()) { - status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); - status.setMessage( - ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_DATA_REPLICATION_FACTOR_IS_ONE); - LOGGER.error(status.getMessage()); - return status; - } - int availableDatanodeSize = configManager .getNodeManager() @@ -574,7 +566,7 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { .size(); // when the configuration is one replication, it will be failed if the data node is not in // running state. 
- if (CONF.getSchemaReplicationFactor() == 1) { + if (CONF.getSchemaReplicationFactor() == 1 || CONF.getDataReplicationFactor() == 1) { final List notRunningDataNodes = removedDataNodes.stream() .filter( @@ -607,33 +599,25 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { .count(); if (availableDatanodeSize - removedDataNodeSize < NodeInfo.getMinimumDataNode()) { status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); - status.setMessage( - String.format( - "Can't remove datanode due to the limit of replication factor, " - + "availableDataNodeSize: %s, maxReplicaFactor: %s, max allowed removed Data Node size is: %s", - availableDatanodeSize, - NodeInfo.getMinimumDataNode(), - (availableDatanodeSize - NodeInfo.getMinimumDataNode()))); + if (NodeInfo.getMinimumDataNode() == 1) { + // With a single replica (schema_replication_factor and data_replication_factor are both 1) + // the only copy of each region lives on one DataNode, so the last remaining DataNode cannot + // be removed: there is nowhere to migrate its regions to. 
+ status.setMessage( + ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA); + } else { + status.setMessage( + String.format( + "Can't remove datanode due to the limit of replication factor, " + + "availableDataNodeSize: %s, maxReplicaFactor: %s, max allowed removed Data Node size is: %s", + availableDatanodeSize, + NodeInfo.getMinimumDataNode(), + (availableDatanodeSize - NodeInfo.getMinimumDataNode()))); + } } return status; } - private boolean hasSingleDataRegionReplica() { - return CONF.getDataReplicationFactor() == 1 - || configManager - .getClusterSchemaManager() - .getMatchedDatabaseSchemasByName( - configManager.getClusterSchemaManager().getDatabaseNames(null), null) - .values() - .stream() - .anyMatch(databaseSchema -> databaseSchema.getDataReplicationFactor() == 1) - || configManager - .getPartitionManager() - .getAllReplicaSets(TConsensusGroupType.DataRegion) - .stream() - .anyMatch(replicaSet -> replicaSet.getDataNodeLocationsSize() <= 1); - } - /** * Checks whether all DataNodes specified for deletion exist in the cluster. * From 782f7569dc58050b0d66f44691d2d326954b8a3a Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Wed, 10 Jun 2026 15:49:42 +0800 Subject: [PATCH 4/6] Fix failWhenRemovingLastSingleReplicaDataNodeUseSQL: use 1C1D single-node removal The previous test set up 1C2D and removed both DataNodes via `remove datanode 2, 3`, but the REMOVE DATANODE grammar accepts only a single INTEGER_LITERAL, so the SQL failed to parse (`mismatched input ','`) instead of hitting the capacity check. The test then asserted on the wrong exception and failed in CI (Cluster IT - 1C3D / Simple). Set up 1C1D and remove the only DataNode, producing a valid single-id `remove datanode <id>`. Removing it leaves 0 < MINIMUM_DATANODE (1), so checkRegionReplication rejects it with the single-replica message, matching the assertions. Verified locally with -PClusterIT: Tests run: 1, Failures: 0. 
--- .../it/removedatanode/IoTDBRemoveDataNodeNormalIT.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java index 7f40ac3d27c88..ea9ed17efbc76 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java @@ -140,7 +140,8 @@ public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { // With a single replica (schema_replication_factor and data_replication_factor are both 1), // removing DataNodes is still supported as long as more than one DataNode remains, but the last // remaining DataNode cannot be removed because there is nowhere to migrate its regions to. - // Here we set up 1C2D with single replica and try to remove both DataNodes, which must fail. + // Here we set up 1C1D with single replica and try to remove the only DataNode, which must fail + // because removing it would leave the cluster with no DataNode. 
EnvFactory.getEnv() .getConfig() .getCommonConfig() @@ -148,7 +149,7 @@ public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { .setSchemaReplicationFactor(1) .setDataReplicationFactor(1) .setDefaultDataRegionGroupNumPerDatabase(1); - EnvFactory.getEnv().initClusterEnvironment(1, 2); + EnvFactory.getEnv().initClusterEnvironment(1, 1); try (final Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); final Statement statement = makeItCloseQuietly(connection.createStatement()); @@ -159,7 +160,7 @@ public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { } final String removeDataNodeSQL = - generateRemoveString(selectRemoveDataNodes(allDataNodeId, allDataNodeId.size())); + generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1)); try { statement.execute(removeDataNodeSQL); Assert.fail( From 047c0a5eb599fe1914f331cb8d1d48a0ac6a492d Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Wed, 10 Jun 2026 16:29:06 +0800 Subject: [PATCH 5/6] Unify RemoveDataNode capacity-rejection message with concrete numbers The previous code special-cased only the single-replica (MINIMUM_DATANODE == 1) "last DataNode" case and otherwise emitted a generic message whose label "maxReplicaFactor" was misleading and which never showed how many DataNodes the request actually tried to remove. Replace the if/else with one parameterized message that always reports the gap: how many DataNodes are being removed, how many are available, the minimum that must remain (max(schema, data) replication factor, with both factors spelled out) and how many would be left. When the minimum is 1, append a single-replica hint explaining at least one DataNode must always remain. Rename the i18n constant to FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW (now a format string) and add FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT, both en and zh. 
Strengthened the failing-path IT to assert the unified message ("Cannot remove" + "single replica"). Verified with -PClusterIT: Tests run: 1, Failures: 0; ConfigNode log shows the rendered message. --- .../IoTDBRemoveDataNodeNormalIT.java | 3 ++ .../confignode/i18n/ProcedureMessages.java | 7 +++-- .../confignode/i18n/ProcedureMessages.java | 7 +++-- .../procedure/env/RemoveDataNodeHandler.java | 28 +++++++++++-------- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java index ea9ed17efbc76..6fcf34f765b60 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java @@ -166,6 +166,9 @@ public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { Assert.fail( "Remove DataNode should fail when it would leave no DataNode under single replica"); } catch (final IoTDBSQLException e) { + // The unified rejection message reports the gap and, for a single replica, appends the + // "at least one DataNode must always remain" hint. 
+ Assert.assertTrue(e.getMessage(), e.getMessage().contains("Cannot remove")); Assert.assertTrue(e.getMessage(), e.getMessage().contains("single replica")); Assert.assertFalse( e.getMessage(), e.getMessage().contains("Failed to remove all requested data nodes")); diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 8a61c0aa8b6ab..a43e5151d4042 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -446,9 +446,10 @@ public final class ProcedureMessages { public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN = "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; - public static final String - FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA = - "Cannot remove the last DataNode when the cluster keeps a single replica (schema_replication_factor and data_replication_factor are both 1). 
With a single replica there is nowhere to migrate the regions to, so removing DataNodes is only supported when more than one DataNode remains."; + public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW = + "Cannot remove %d DataNode(s): the cluster has %d available DataNode(s) and must retain at least %d of them (max(schema_replication_factor=%d, data_replication_factor=%d)) so that every region keeps enough replicas, but this request would leave only %d."; + public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT = + " With a single replica there is nowhere to migrate regions to, so at least one DataNode must always remain."; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index 33cf695f0aaed..dbad526cce7b8 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -446,9 +446,10 @@ public final class ProcedureMessages { public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN = "Failed to remove data node {} because it is not in running and the configuration of cluster is one replication"; - public static final String - FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA = - "集群为单副本(schema_replication_factor 与 data_replication_factor 均为 1)时,不能移除最后一个 DataNode:单副本下没有其它节点可供迁移 Region,因此仅当移除后仍保留多于一个 DataNode 时才支持移除 DataNode。"; + public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW = + "无法移除 %d 个 DataNode:集群当前有 %d 
个可用 DataNode,且至少需保留 %d 个(max(schema_replication_factor=%d, data_replication_factor=%d)),以保证每个 Region 仍有足够的副本;但本次请求执行后将只剩 %d 个。"; + public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT = + " 单副本下没有其它节点可供迁移 Region,因此必须始终保留至少一个 DataNode。"; public static final String FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED = "Failed to rollback alter pipe {}, details: {}, metadata will be synchronized later."; public static final String FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java index 4a6f9edf68fef..6782c1b652a3b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java @@ -599,21 +599,25 @@ public TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) { .count(); if (availableDatanodeSize - removedDataNodeSize < NodeInfo.getMinimumDataNode()) { status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode()); + // Report the concrete numbers so operators can see the gap: how many DataNodes are being + // removed, how many are available, the minimum that must remain (the larger of the schema and + // data replication factors) and how many would be left. 
+ String message = + String.format( + ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW, + removedDataNodeSize, + availableDatanodeSize, + NodeInfo.getMinimumDataNode(), + CONF.getSchemaReplicationFactor(), + CONF.getDataReplicationFactor(), + availableDatanodeSize - removedDataNodeSize); if (NodeInfo.getMinimumDataNode() == 1) { // With a single replica (schema_replication_factor and data_replication_factor are both 1) - // the only copy of each region lives on one DataNode, so the last remaining DataNode cannot - // be removed: there is nowhere to migrate its regions to. - status.setMessage( - ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_THE_LAST_SINGLE_REPLICA); - } else { - status.setMessage( - String.format( - "Can't remove datanode due to the limit of replication factor, " - + "availableDataNodeSize: %s, maxReplicaFactor: %s, max allowed removed Data Node size is: %s", - availableDatanodeSize, - NodeInfo.getMinimumDataNode(), - (availableDatanodeSize - NodeInfo.getMinimumDataNode()))); + // the only copy of each region lives on one DataNode, so at least one DataNode must always + // remain: there is nowhere to migrate its regions to. + message += ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT; } + status.setMessage(message); } return status; } From 70bee2719f34871f8be3f3660e57c5602be3dea6 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Wed, 10 Jun 2026 17:22:07 +0800 Subject: [PATCH 6/6] Move the last-DataNode removal test to its own 1C1D suite class failWhenRemovingLastSingleReplicaDataNodeUseSQL only needs a 1C1D cluster, but it lived in IoTDBRemoveDataNodeNormalIT which is @Category({ClusterIT.class}) and therefore ran in the 1C3D suite alongside the multi-DataNode removal tests. Extract it into a new IoTDBRemoveLastDataNodeIT annotated @Category({LocalStandaloneIT.class}) so it runs in the 1C1D (Simple) suite, isolated from the cluster tests. 
Verified with the default with-integration-tests profile (no -PClusterIT): Tests run: 1, Failures: 0 on a 1C1D cluster. --- .../IoTDBRemoveDataNodeNormalIT.java | 41 ------- .../IoTDBRemoveLastDataNodeIT.java | 111 ++++++++++++++++++ 2 files changed, 111 insertions(+), 41 deletions(-) create mode 100644 integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java index 6fcf34f765b60..4ce111490701a 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveDataNodeNormalIT.java @@ -135,47 +135,6 @@ public void tearDown() throws InterruptedException { // ConsensusFactory.IOT_CONSENSUS_V2); // } - @Test - public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { - // With a single replica (schema_replication_factor and data_replication_factor are both 1), - // removing DataNodes is still supported as long as more than one DataNode remains, but the last - // remaining DataNode cannot be removed because there is nowhere to migrate its regions to. - // Here we set up 1C1D with single replica and try to remove the only DataNode, which must fail - // because removing it would leave the cluster with no DataNode. 
- EnvFactory.getEnv() - .getConfig() - .getCommonConfig() - .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS) - .setSchemaReplicationFactor(1) - .setDataReplicationFactor(1) - .setDefaultDataRegionGroupNumPerDatabase(1); - EnvFactory.getEnv().initClusterEnvironment(1, 1); - - try (final Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); - final Statement statement = makeItCloseQuietly(connection.createStatement()); - final ResultSet resultSet = statement.executeQuery(SHOW_DATANODES)) { - final Set allDataNodeId = new HashSet<>(); - while (resultSet.next()) { - allDataNodeId.add(resultSet.getInt(ColumnHeaderConstant.NODE_ID)); - } - - final String removeDataNodeSQL = - generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1)); - try { - statement.execute(removeDataNodeSQL); - Assert.fail( - "Remove DataNode should fail when it would leave no DataNode under single replica"); - } catch (final IoTDBSQLException e) { - // The unified rejection message reports the gap and, for a single replica, appends the - // "at least one DataNode must always remain" hint. 
- Assert.assertTrue(e.getMessage(), e.getMessage().contains("Cannot remove")); - Assert.assertTrue(e.getMessage(), e.getMessage().contains("single replica")); - Assert.assertFalse( - e.getMessage(), e.getMessage().contains("Failed to remove all requested data nodes")); - } - } - } - @Test public void fail1C3DTestIoTUseSQL() throws Exception { // Setup 1C3D with schema replication factor = 3, and remove 1D, this test should fail due to diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java new file mode 100644 index 0000000000000..ca5a2929fd681 --- /dev/null +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.it.removedatanode; + +import org.apache.iotdb.commons.schema.column.ColumnHeaderConstant; +import org.apache.iotdb.consensus.ConsensusFactory; +import org.apache.iotdb.it.env.EnvFactory; +import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.LocalStandaloneIT; +import org.apache.iotdb.jdbc.IoTDBSQLException; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; + +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.HashSet; +import java.util.Set; + +import static org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.generateRemoveString; +import static org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.selectRemoveDataNodes; +import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly; + +/** + * Removing the last DataNode of a single-replica cluster must be rejected. This only needs a 1C1D + * cluster, so it lives in the 1C1D (LocalStandaloneIT) suite, separate from the multi-DataNode + * removal tests in {@link IoTDBRemoveDataNodeNormalIT}. 
+ */ +@Category({LocalStandaloneIT.class}) +@RunWith(IoTDBTestRunner.class) +public class IoTDBRemoveLastDataNodeIT { + + private static final String SHOW_DATANODES = "show datanodes"; + + @Before + public void setUp() throws Exception { + EnvFactory.getEnv() + .getConfig() + .getCommonConfig() + .setConfigNodeConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS) + .setSchemaRegionConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS); + } + + @After + public void tearDown() throws InterruptedException { + EnvFactory.getEnv().cleanClusterEnvironment(); + } + + @Test + public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws Exception { + // With a single replica (schema_replication_factor and data_replication_factor are both 1), + // removing DataNodes is still supported as long as more than one DataNode remains, but the last + // remaining DataNode cannot be removed because there is nowhere to migrate its regions to. + // Here we set up 1C1D with single replica and try to remove the only DataNode, which must fail + // because removing it would leave the cluster with no DataNode. 
+ EnvFactory.getEnv() + .getConfig() + .getCommonConfig() + .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS) + .setSchemaReplicationFactor(1) + .setDataReplicationFactor(1) + .setDefaultDataRegionGroupNumPerDatabase(1); + EnvFactory.getEnv().initClusterEnvironment(1, 1); + + try (final Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + final Statement statement = makeItCloseQuietly(connection.createStatement()); + final ResultSet resultSet = statement.executeQuery(SHOW_DATANODES)) { + final Set allDataNodeId = new HashSet<>(); + while (resultSet.next()) { + allDataNodeId.add(resultSet.getInt(ColumnHeaderConstant.NODE_ID)); + } + + final String removeDataNodeSQL = + generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1)); + try { + statement.execute(removeDataNodeSQL); + Assert.fail( + "Remove DataNode should fail when it would leave no DataNode under single replica"); + } catch (final IoTDBSQLException e) { + // The unified rejection message reports the gap and, for a single replica, appends the + // "at least one DataNode must always remain" hint. + Assert.assertTrue(e.getMessage(), e.getMessage().contains("Cannot remove")); + Assert.assertTrue(e.getMessage(), e.getMessage().contains("single replica")); + Assert.assertFalse( + e.getMessage(), e.getMessage().contains("Failed to remove all requested data nodes")); + } + } + } +}