This setting controls how much DataNode (DN) volumes are allowed to differ, in bytes of free disk space, before they are considered imbalanced. If the free space of every volume is within this range of every other volume, the volumes are considered balanced and block assignments are done on a pure round-robin basis.
ublic boolean areAllVolumesWithinFreeSpaceThreshold() {
long leastAvailable = Long.MAX_VALUE;
long mostAvailable = 0;
for (AvailableSpaceVolumePair volume : volumes) {
leastAvailable = Math.min(leastAvailable, volume.getAvailable());
mostAvailable = Math.max(mostAvailable, volume.getAvailable());
return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
This setting controls what percentage of new block allocations will be sent to volumes with more available disk space than others. This setting should be in the range 0.0 - 1.0, though in practice 0.5 - 1.0, since there should be no reason to prefer that volumes with less available disk space receive more block allocations.
/**
 * This interface specifies the policy for choosing volumes to store replicas.
 */
@InterfaceAudience.Private
public interface VolumeChoosingPolicy<V extends FsVolumeSpi> {

  /**
   * Choose a volume to place a replica,
   * given a list of volumes and the replica size sought for storage.
   *
   * The implementations of this interface must be thread-safe.
   *
   * @param volumes - a list of available volumes.
   * @param replicaSize - the size of the replica for which a volume is sought.
   * @return the chosen volume.
   * @throws IOException when disks are unavailable or are full.
   */
  public V chooseVolume(List<V> volumes, long replicaSize) throws IOException;
}
/** * Choose volumes in round-robin order. */public class RoundRobinVolumeChoosingPolicy<V extends FsVolumeSpi> implements VolumeChoosingPolicy<V> { public static final Log LOG = LogFactory.getLog(RoundRobinVolumeChoosingPolicy.class); private int curVolume = 0; @Override public synchronized V chooseVolume(final List<V> volumes, long blockSize) throws IOException {//同步调用,datanode每次需要调用选择磁盘进行块写入时调用 if(volumes.size() < 1) { throw new DiskOutOfSpaceException("No more available volumes"); } // since volumes could've been removed because of the failure // make sure we are not out of bounds if(curVolume >= volumes.size()) { curVolume = 0; } int startVolume = curVolume; long maxAvailable = 0; while (true) { final V volume = volumes.get(curVolume); curVolume = (curVolume + 1) % volumes.size();//每次从当前volume加一,然后判断可用空间是否足够 long availableVolumeSize = volume.getAvailable(); if (availableVolumeSize > blockSize) { return volume; } if (availableVolumeSize > maxAvailable) { maxAvailable = availableVolumeSize; } if (curVolume == startVolume) { throw new DiskOutOfSpaceException("Out of space: " + "The volume with the most available space (=" + maxAvailable + " B) is less than the block size (=" + blockSize + " B)."); } } }}
public synchronized V chooseVolume(List<V> volumes, long replicaSize) throws IOException { if (volumes.size() < 1) { throw new DiskOutOfSpaceException("No more available volumes"); } AvailableSpaceVolumeList volumesWithSpaces = new AvailableSpaceVolumeList(volumes);//先从volumes包装成AvailableSpaceVolumeList,然后下面进行判断 if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {//全部volume中空间最大减去最小不超过参数阈值设置,则调用轮询的策略 // If they're actually not too far out of whack, fall back on pure round // robin. V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("All volumes are within the configured free space balance " + "threshold. Selecting " + volume + " for write of block size " + replicaSize); } return volume; } else {//全部volume中空间最大减去最小超过参数阈值设置 V volume = null; // If none of the volumes with low free space have enough space for the // replica, always try to choose a volume with a lot of free space. long mostAvailableAmongLowVolumes = volumesWithSpaces .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace(); //先从volume中去的最少可用空间和最大可用空间。//"最小可用空间"到"最小可用空间+阈值"之间的为lowAvailableVolumes//"最小可用空间+阈值"到"最大可用空间"之间的为highAvailableVolumes//举例:volumes中最小可用空间为100G,最大可用空间为150G,阈值设置为20G,则100G~120G的volumes为lowAvailableVolumes,120G~150G的volumes为highAvailableVolumes。 List<V> highAvailableVolumes = extractVolumesFromPairs( volumesWithSpaces.getVolumesWithHighAvailableSpace()); List<V> lowAvailableVolumes = extractVolumesFromPairs( volumesWithSpaces.getVolumesWithLowAvailableSpace());// float preferencePercentScaler = (highAvailableVolumes.size() * balancedPreferencePercent) + (lowAvailableVolumes.size() * (1 - balancedPreferencePercent)); float scaledPreferencePercent = (highAvailableVolumes.size() * balancedPreferencePercent) / preferencePercentScaler;//选择在highAvailableVolumes中进行轮询的条件有两个之一即可://1、在lowAvailableVolumes中,如果最大可用空间小于块大小;//2、满足公式random.nextFloat() < (highAvailableVolumes.size() * 
balancedPreferencePercent)/(highAvailableVolumes.size() * balancedPreferencePercent) + (lowAvailableVolumes.size() * (1 - balancedPreferencePercent))//我们举例来进行描述,受限最小可用空间为100G,最大可用空间为150G,阈值为20G,balancedPreferencePercent阈值为默认0.75.下面用5个盘来说明,一步步解析上面的公式。// 100G 110G 130G 140G 150G//|<- lowAvailableVolumes ->|<- highAvailableVolumes ->| //highAvailableVolumes.size() = 3//lowAvailableVolumes.size() = 2//balancedPreferencePercent=0.75//(highAvailableVolumes.size() * balancedPreferencePercent)/(highAvailableVolumes.size() * balancedPreferencePercent) + (lowAvailableVolumes.size() * (1 - balancedPreferencePercent))//(3 * 0.75 )/(3 * 0.75 ) + (2 * (1 - 0.75 ))//这个公式其实根据balancedPreferencePercent在0到1之间的取值,生成0到1的值。分子是highAvailableVolumes.size()*balancedPreferencePercent所以,当balancedPreferencePercent越大时,那么公式最终值越接近1.//random.nextFloat()取值为0~1,若公式最终值越接近1,那么random.nextFloat() <的可能性越高,也就越会进行highAvailableVolumes进行轮询写入块了。 if (mostAvailableAmongLowVolumes < replicaSize || random.nextFloat() < scaledPreferencePercent) { volume = roundRobinPolicyHighAvailable.chooseVolume( highAvailableVolumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("Volumes are imbalanced. Selecting " + volume + " from high available space volumes for write of block size " + replicaSize); } } else { volume = roundRobinPolicyLowAvailable.chooseVolume( lowAvailableVolumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("Volumes are imbalanced. Selecting " + volume + " from low available space volumes for write of block size " + replicaSize); } } return volume; }}