本文主要通过一个crushmap的例子,来探讨crushmap将PG映射到OSD的过程。
1. 生成crushmap.bin
我们有如下crushmap.txt:
[root@localhost ceph-test]# cat crushmap.txt
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable straw_calc_version 1
# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
device 6 osd.6
device 7 osd.7
device 8 osd.8
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
type 11 osd-domain
type 12 host-domain
type 13 replica-domain
type 14 failure-domain
# buckets
host node7-1 {
id -2 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item osd.0 weight 0.150
item osd.1 weight 0.150
item osd.2 weight 0.150
}
rack rack-01 {
id -3 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-1 weight 0.450
}
host node7-2 {
id -4 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item osd.3 weight 0.150
item osd.4 weight 0.150
item osd.5 weight 0.150
}
rack rack-02 {
id -5 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-2 weight 0.450
}
host node7-3 {
id -6 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item osd.6 weight 0.150
item osd.7 weight 0.150
item osd.8 weight 0.150
}
rack rack-03 {
id -7 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-3 weight 0.450
}
root default {
id -1 # do not change unnecessarily
# weight 1.350
alg straw
hash 0 # rjenkins1
item rack-01 weight 0.450
item rack-02 weight 0.450
item rack-03 weight 0.450
}
host-domain host-group-0-rack-01 {
id -8 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-1 weight 0.450
}
host-domain host-group-0-rack-02 {
id -11 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-2 weight 0.450
}
host-domain host-group-0-rack-03 {
id -12 # do not change unnecessarily
# weight 0.450
alg straw
hash 0 # rjenkins1
item node7-3 weight 0.450
}
replica-domain replica-0 {
id -9 # do not change unnecessarily
# weight 1.350
alg straw
hash 0 # rjenkins1
item host-group-0-rack-01 weight 0.450
item host-group-0-rack-02 weight 0.450
item host-group-0-rack-03 weight 0.450
}
failure-domain sata-00 {
id -10 # do not change unnecessarily
# weight 1.350
alg straw
hash 0 # rjenkins1
item replica-0 weight 1.350
}
# rules
rule replicated_ruleset {
ruleset 0
type replicated
min_size 1
max_size 10
step take default
step choose firstn 0 type osd
step emit
}
rule replicated_rule-5 {
ruleset 5
type replicated
min_size 1
max_size 10
step take sata-00
step choose firstn 1 type replica-domain
step chooseleaf firstn 0 type host-domain
step emit
}
# end crush map
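调用如下命令将crushmap.txt编译生成二进制的crushmap(本文后续命令中使用的文件名为crushmap-new.bin):
[root@localhost ceph-test]# crushtool -c crushmap.txt -o crushmap-new.bin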
整个crushmap的层级结构如下:
[root@localhost ceph-test]# crushtool --test -i crushmap-new.bin --tree
WARNING: no output selected; use --output-csv or --show-X
ID WEIGHT TYPE NAME
-10 1.34999 failure-domain sata-00
-9 1.34999 replica-domain replica-0
-8 0.45000 host-domain host-group-0-rack-01
-2 0.45000 host node7-1
0 0.14999 osd.0
1 0.14999 osd.1
2 0.14999 osd.2
-11 0.45000 host-domain host-group-0-rack-02
-4 0.45000 host node7-2
3 0.14999 osd.3
4 0.14999 osd.4
5 0.14999 osd.5
-12 0.45000 host-domain host-group-0-rack-03
-6 0.45000 host node7-3
6 0.14999 osd.6
7 0.14999 osd.7
8 0.14999 osd.8
-1 1.34999 root default
-3 0.45000 rack rack-01
-2 0.45000 host node7-1
0 0.14999 osd.0
1 0.14999 osd.1
2 0.14999 osd.2
-5 0.45000 rack rack-02
-4 0.45000 host node7-2
3 0.14999 osd.3
4 0.14999 osd.4
5 0.14999 osd.5
-7 0.45000 rack rack-03
-6 0.45000 host node7-3
6 0.14999 osd.6
7 0.14999 osd.7
8 0.14999 osd.8
2. 测试PG映射到OSD的过程
如下我们使用crushtool工具来测试PG到OSD的映射。上面我们有两个rule,其对应的ruleset分别是ruleset 0与ruleset 5。
注意: 这里如果不指定min_x与max_x,则系统默认会映射[0,1023]这1024个PG
如下我们采用ruleset 5映射PG 0~PG 10(输出中的rule 1是该规则在crushmap中的序号,即第二条rule,它对应的正是ruleset 5):
[root@localhost ceph-test]# crushtool --test -i crushmap-new.bin --show-mappings --ruleset 5 --num-rep=3 --min_x=0 --max_x=10
CRUSH rule 1 x 0 [3,0,7]
CRUSH rule 1 x 1 [5,0,7]
CRUSH rule 1 x 2 [8,3,1]
CRUSH rule 1 x 3 [8,0,4]
CRUSH rule 1 x 4 [1,4,7]
CRUSH rule 1 x 5 [3,8,0]
CRUSH rule 1 x 6 [3,6,1]
CRUSH rule 1 x 7 [5,8,2]
CRUSH rule 1 x 8 [7,5,0]
CRUSH rule 1 x 9 [8,3,1]
CRUSH rule 1 x 10 [4,0,8]
3. 源代码分析
下面我们结合crushtool源代码来分析上述命令的执行过程,以进一步了解crushmap.
3.1 解析test参数
参数--test表明以test方式运行。在这里--test参数是必须的,否则将不能执行到我们的映射函数:
3.2 解析i参数
参数-i指定输入的crushmap.bin文件,crushtool工具需要通过解析该文件来获取crushmap对象:
这里crush.decode(p)是《crushmap详解-1》中crushmap编码的一个逆过程,这里不做详细解释。
3.3 解析show-mappings参数
参数--show-mappings告诉crushtool将PG->OSD的映射打印出来。
3.4 解析ruleset参数
3.5 解析num-rep参数
参数num-rep指定数据的副本数,直接关系到一个PG会映射到多少个OSD上。
3.6 解析min_x与max_x参数
通过这两个参数来指定映射哪个范围内的PG。如果未指定,后续可以看到默认会映射PG 0~PG 1023范围内的所有PG。
3.7 主要函数分析
上述命令的主要执行过程为如下函数:
1) 判断min_rule,max_rule,min_x,max_x等的值
这里我们在传参时没有设置min_rule、max_rule,因此这里min_rule被设置为0,max_rule被设置为1(即crush.get_max_rules() - 1);而min_x与max_x则直接为我们传入的参数值0和10。
2) 初始化osd weights
首先遍历所有的device,这里有osd.0~osd.8共9个OSD。如果该设备权重通过crushtool命令行参数--weight设置过了,则采用参数传递进来的权重;否则,检查该device是否在bucket中有用到,如果用到则将该device权重设置为0x10000,没有被使用则将该设备权重置为0。
再接着调用adjust_weights(weight)调整设备权重。在该函数中需要满足mark_down_device_ratio > 0才会进行调整,而这里该条件不满足,因此调整逻辑并不会被执行。
3) 计算当前active devices
这里所有9个device均为active
4) 遍历所有的rule
从上面的分析,其实就是找到对应的rule,然后在该rule下,针对每一个rep数,对PG进行crushmap映射。
while(rules)
{
for(rep in rep_min:rep_max)
{
do_rule(rule,pg_id,out_vec,rep,device_weight_vec);
}
}
在讨论完了crushtool映射PG的一个整体流程之后,我们接下来就会详细讨论具体的do_rule算法。由于这一部分较为复杂,我们将其作为另外单独一章来讲解。