[ceph-users] CRUSH rule seems to work fine not for all PGs in erasure coded pools

Jakub Jaszewski jaszewski.jakub at gmail.com
Tue Nov 28 05:43:13 PST 2017


Hi, I'm trying to understand erasure coded pools and why CRUSH rules seem
to work for only some of the PGs in EC pools.

Basically, what I'm trying to do is check the recovery behaviour of an
erasure coded pool after a single OSD or single HOST failure.
I noticed that in the case of a HOST failure only some of the PGs get
recovered to active+remapped, while the other PGs remain in the
active+undersized+degraded state. Why?
The EC pool profile I use is k=3, m=2.

Also, I'm not really sure what all the steps of the crush rule below mean
(perhaps the rule is the root cause).
rule ecpool_3_2 {
ruleset 1
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5 # Should I maybe try to increase this number of
retries? Can I apply the changes to the existing EC crush rule and pool, or
do I need to create a new one?
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host # Does this allow choosing more than one
OSD from a single HOST, while first trying to get only one OSD per HOST if
there are enough HOSTs in the cluster?
step emit
}

ceph version 10.2.9 (jewel)

# INITIAL CLUSTER STATE
root at host01:~# ceph osd tree
ID  WEIGHT    TYPE NAME                        UP/DOWN REWEIGHT
PRIMARY-AFFINITY
 -1 218.18401 root default

 -6 218.18401     region MyRegion

 -5 218.18401         datacenter MyDC

 -4 218.18401             room MyRoom

 -3  43.63699                 rack Rack01

 -2  43.63699                     host host01

  0   3.63599                         osd.0         up  1.00000
 1.00000
  3   3.63599                         osd.3         up  1.00000
 1.00000
  4   3.63599                         osd.4         up  1.00000
 1.00000
  6   3.63599                         osd.6         up  1.00000
 1.00000
  8   3.63599                         osd.8         up  1.00000
 1.00000
 10   3.63599                         osd.10        up  1.00000
 1.00000
 12   3.63599                         osd.12        up  1.00000
 1.00000
 14   3.63599                         osd.14        up  1.00000
 1.00000
 16   3.63599                         osd.16        up  1.00000
 1.00000
 19   3.63599                         osd.19        up  1.00000
 1.00000
 22   3.63599                         osd.22        up  1.00000
 1.00000
 25   3.63599                         osd.25        up  1.00000
 1.00000
 -8  43.63699                 rack Rack02

 -7  43.63699                     host host02

  1   3.63599                         osd.1         up  1.00000
 1.00000
  2   3.63599                         osd.2         up  1.00000
 1.00000
  5   3.63599                         osd.5         up  1.00000
 1.00000
  7   3.63599                         osd.7         up  1.00000
 1.00000
  9   3.63599                         osd.9         up  1.00000
 1.00000
 11   3.63599                         osd.11        up  1.00000
 1.00000
 13   3.63599                         osd.13        up  1.00000
 1.00000
 15   3.63599                         osd.15        up  1.00000
 1.00000
 17   3.63599                         osd.17        up  1.00000
 1.00000
 20   3.63599                         osd.20        up  1.00000
 1.00000
 23   3.63599                         osd.23        up  1.00000
 1.00000
 26   3.63599                         osd.26        up  1.00000
 1.00000
-10 130.91000                 rack Rack03

 -9  43.63699                     host host03

 18   3.63599                         osd.18        up  1.00000
 1.00000
 21   3.63599                         osd.21        up  1.00000
 1.00000
 24   3.63599                         osd.24        up  1.00000
 1.00000
 27   3.63599                         osd.27        up  1.00000
 1.00000
 28   3.63599                         osd.28        up  1.00000
 1.00000
 29   3.63599                         osd.29        up  1.00000
 1.00000
 30   3.63599                         osd.30        up  1.00000
 1.00000
 31   3.63599                         osd.31        up  1.00000
 1.00000
 32   3.63599                         osd.32        up  1.00000
 1.00000
 33   3.63599                         osd.33        up  1.00000
 1.00000
 34   3.63599                         osd.34        up  1.00000
 1.00000
 35   3.63599                         osd.35        up  1.00000
 1.00000
-11  43.63699                     host host04

 36   3.63599                         osd.36        up  1.00000
 1.00000
 37   3.63599                         osd.37        up  1.00000
 1.00000
 38   3.63599                         osd.38        up  1.00000
 1.00000
 39   3.63599                         osd.39        up  1.00000
 1.00000
 40   3.63599                         osd.40        up  1.00000
 1.00000
 41   3.63599                         osd.41        up  1.00000
 1.00000
 42   3.63599                         osd.42        up  1.00000
 1.00000
 43   3.63599                         osd.43        up  1.00000
 1.00000
 44   3.63599                         osd.44        up  1.00000
 1.00000
 45   3.63599                         osd.45        up  1.00000
 1.00000
 46   3.63599                         osd.46        up  1.00000
 1.00000
 47   3.63599                         osd.47        up  1.00000
 1.00000
-12  43.63699                     host host05

 48   3.63599                         osd.48        up  1.00000
 1.00000
 49   3.63599                         osd.49        up  1.00000
 1.00000
 50   3.63599                         osd.50        up  1.00000
 1.00000
 51   3.63599                         osd.51        up  1.00000
 1.00000
 52   3.63599                         osd.52        up  1.00000
 1.00000
 53   3.63599                         osd.53        up  1.00000
 1.00000
 54   3.63599                         osd.54        up  1.00000
 1.00000
 55   3.63599                         osd.55        up  1.00000
 1.00000
 56   3.63599                         osd.56        up  1.00000
 1.00000
 57   3.63599                         osd.57        up  1.00000
 1.00000
 58   3.63599                         osd.58        up  1.00000
 1.00000
 59   3.63599                         osd.59        up  1.00000
 1.00000
root at host01:~# ceph -w
    cluster a6f73750-1972-47f6-bcf5-a99753be65ad
     health HEALTH_OK
     monmap e2: 3 mons at {host01=
10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
            election epoch 22, quorum 0,1,2 host01,host02,host03
     osdmap e527: 60 osds: 60 up, 60 in
            flags sortbitwise,require_jewel_osds
      pgmap v57164: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
            4665 MB used, 218 TB / 218 TB avail
                3736 active+clean

2017-11-28 07:38:52.350228 mon.0 [INF] pgmap v57163: 3736 pgs: 3736
active+clean; 10343 bytes data, 4665 MB used, 218 TB / 218 TB avail
...
root at host01:~#

In the 1st scenario I stop a single OSD (id 48, host host05), and after 5
minutes the cluster starts to recover by remapping PGs using other OSDs from
HOST host05.

In the 2nd scenario, I stop all Ceph services on one HOST, host05.

# FIND ALL PGs USING OSDs FROM HOST host05
root at host01:~# ceph pg dump pgs_brief  |egrep
'\[48|,48|\[49|,49|\[50|,50|\[51|,51|\[52|,52|\[53|,53|\[54|,54|\[55|,55|\[56|,56|\[57|,57|\[58|,58|\[59|,59'
> PGs_on_HOST_host05
dumped pgs_brief in format plain
root at host01:~# wc -l PGs_on_HOST_host05
2556 PGs_on_HOST_host05

# STOP ALL CEPH SERVICES on HOST host05
root at host05:~# systemctl stop ceph.target

root at host01:~# ceph -w
    cluster a6f73750-1972-47f6-bcf5-a99753be65ad
     health HEALTH_OK
     monmap e2: 3 mons at {host01=
10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
            election epoch 22, quorum 0,1,2 host01,host02,host03
     osdmap e538: 60 osds: 59 up, 59 in
            flags sortbitwise,require_jewel_osds
      pgmap v57405: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
            4581 MB used, 214 TB / 214 TB avail
                3736 active+clean

2017-11-28 08:08:21.349340 mon.0 [INF] pgmap v57405: 3736 pgs: 3736
active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:33.082249 mon.0 [INF] osd.57 marked itself down
2017-11-28 08:08:33.082607 mon.0 [INF] osd.49 marked itself down
2017-11-28 08:08:33.082899 mon.0 [INF] osd.59 marked itself down
2017-11-28 08:08:33.083471 mon.0 [INF] osd.56 marked itself down
2017-11-28 08:08:33.084091 mon.0 [INF] osd.58 marked itself down
2017-11-28 08:08:33.084842 mon.0 [INF] osd.53 marked itself down
2017-11-28 08:08:33.085373 mon.0 [INF] osd.50 marked itself down
2017-11-28 08:08:33.085830 mon.0 [INF] osd.54 marked itself down
2017-11-28 08:08:33.086437 mon.0 [INF] osd.55 marked itself down
2017-11-28 08:08:33.086664 mon.0 [INF] osd.52 marked itself down
2017-11-28 08:08:33.086970 mon.0 [INF] osd.51 marked itself down
2017-11-28 08:08:33.246299 mon.0 [INF] osdmap e539: 60 osds: 48 up, 59 in
2017-11-28 08:08:33.253694 mon.0 [INF] pgmap v57406: 3736 pgs: 3736
active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:34.333012 mon.0 [INF] osdmap e540: 60 osds: 48 up, 59 in
2017-11-28 08:08:34.348753 mon.0 [INF] pgmap v57407: 3736 pgs: 64 peering,
658 stale+active+clean, 3014 active+clean; 10343 bytes data, 4581 MB used,
214 TB / 214 TB avail
2017-11-28 08:08:35.344372 mon.0 [INF] pgmap v57408: 3736 pgs: 4
active+undersized+degraded, 42 activating+undersized+degraded, 64 peering,
648 stale+active+clean, 2978 active+clean; 10343 bytes data, 4581 MB used,
214 TB / 214 TB avail
2017-11-28 08:08:36.375645 mon.0 [INF] pgmap v57409: 3736 pgs: 268
active+undersized+degraded, 42 activating+undersized+degraded, 64 peering,
578 stale+active+clean, 2784 active+clean; 10343 bytes data, 4584 MB used,
214 TB / 214 TB avail; 24/791 objects degraded (3.034%)
2017-11-28 08:08:37.457164 mon.0 [INF] pgmap v57410: 3736 pgs: 1750
active+undersized+degraded, 42 activating+undersized+degraded, 64 peering,
198 stale+active+clean, 1682 active+clean; 10343 bytes data, 4622 MB used,
214 TB / 214 TB avail; 141/791 objects degraded (17.826%)
2017-11-28 08:08:38.466174 mon.0 [INF] pgmap v57411: 3736 pgs: 2450
active+undersized+degraded, 42 activating+undersized+degraded, 64 peering,
1180 active+clean; 10343 bytes data, 4643 MB used, 214 TB / 214 TB avail;
190/791 objects degraded (24.020%)
2017-11-28 08:08:39.454811 mon.0 [INF] pgmap v57412: 3736 pgs: 2556
active+undersized+degraded, 1180 active+clean; 10343 bytes data, 4645 MB
used, 214 TB / 214 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:08:45.202295 mon.0 [INF] HEALTH_WARN; 2556 pgs degraded; 2549
pgs stuck unclean; 2556 pgs undersized; recovery 193/791 objects degraded
(24.399%); 11/59 in osds are down
.... AFTER 5 MINUTES PGs REMAPPING HAS STARTED
2017-11-28 08:12:45.205422 mon.0 [INF] HEALTH_WARN; 2556 pgs degraded; 2556
pgs stuck unclean; 2556 pgs undersized; recovery 193/791 objects degraded
(24.399%); 11/59 in osds are down
2017-11-28 08:12:51.570936 mon.0 [INF] pgmap v57446: 3736 pgs: 2556
active+undersized+degraded, 1180 active+clean; 10343 bytes data, 4632 MB
used, 214 TB / 214 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:13:35.060583 mon.0 [INF] osd.49 out (down for 301.868797)
2017-11-28 08:13:35.060723 mon.0 [INF] osd.50 out (down for 301.868797)
2017-11-28 08:13:35.060753 mon.0 [INF] osd.51 out (down for 301.868797)
2017-11-28 08:13:35.060783 mon.0 [INF] osd.52 out (down for 301.868796)
2017-11-28 08:13:35.060812 mon.0 [INF] osd.53 out (down for 301.868796)
2017-11-28 08:13:35.060842 mon.0 [INF] osd.54 out (down for 301.868796)
2017-11-28 08:13:35.060870 mon.0 [INF] osd.55 out (down for 301.868795)
2017-11-28 08:13:35.060928 mon.0 [INF] osd.56 out (down for 301.868795)
2017-11-28 08:13:35.060958 mon.0 [INF] osd.57 out (down for 301.868795)
2017-11-28 08:13:35.060990 mon.0 [INF] osd.58 out (down for 301.868795)
2017-11-28 08:13:35.061021 mon.0 [INF] osd.59 out (down for 301.868794)
2017-11-28 08:13:35.274737 mon.0 [INF] osdmap e541: 60 osds: 48 up, 48 in
2017-11-28 08:13:35.276185 mon.0 [INF] pgmap v57447: 3736 pgs: 2556
active+undersized+degraded, 1180 active+clean; 10343 bytes data, 3773 MB
used, 174 TB / 174 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:13:36.330316 mon.0 [INF] osdmap e542: 60 osds: 48 up, 48 in
2017-11-28 08:13:36.334183 mon.0 [INF] pgmap v57448: 3736 pgs: 135
remapped+peering, 2421 active+undersized+degraded, 1180 active+clean; 10343
bytes data, 3775 MB used, 174 TB / 174 TB avail; 174/791 objects degraded
(21.997%)
2017-11-28 08:13:37.289319 mon.0 [INF] osdmap e543: 60 osds: 48 up, 48 in
2017-11-28 08:13:37.326379 mon.0 [INF] pgmap v57449: 3736 pgs: 4
active+undersized+remapped, 900 peering, 329 remapped+peering, 1323
active+undersized+degraded, 1180 active+clean; 10343 bytes data, 3784 MB
used, 174 TB / 174 TB avail; 69/791 objects degraded (8.723%); 65 B/s, 117
objects/s recovering
2017-11-28 08:13:36.172666 osd.28 [INF] 8.6 starting backfill to osd.22
from (0'0,0'0] MAX to 538'46558
2017-11-28 08:13:36.174172 osd.28 [INF] 8.6 starting backfill to osd.47
from (0'0,0'0] MAX to 538'46558
2017-11-28 08:13:36.184611 osd.19 [INF] 8.5 starting backfill to osd.40
from (0'0,0'0] MAX to 538'52902
2017-11-28 08:13:36.190060 osd.24 [INF] 7.3 starting backfill to osd.8 from
(0'0,0'0] MAX to 538'3172
2017-11-28 08:13:36.193337 osd.24 [INF] 7.3 starting backfill to osd.41
from (0'0,0'0] MAX to 538'3172
2017-11-28 08:13:37.517955 osd.21 [INF] 5.144 scrub starts
2017-11-28 08:13:37.518701 osd.21 [INF] 5.144 scrub ok
2017-11-28 08:13:38.235143 mon.0 [INF] osdmap e544: 60 osds: 48 up, 48 in
2017-11-28 08:13:38.250128 mon.0 [INF] pgmap v57450: 3736 pgs: 37
activating, 7 activating+remapped, 4 active+undersized+remapped, 37 active,
902 peering, 87 active+remapped, 313 remapped+peering, 793
active+undersized+degraded, 1556 active+clean; 10343 bytes data, 3789 MB
used, 174 TB / 174 TB avail; 40/791 objects degraded (5.057%); 5/791
objects misplaced (0.632%); 629 B/s, 167 objects/s recovering
2017-11-28 08:13:36.157779 osd.18 [INF] 8.3 starting backfill to osd.17
from (0'0,0'0] MAX to 538'34158
2017-11-28 08:13:38.147555 osd.18 [INF] 5.203 deep-scrub starts
2017-11-28 08:13:38.148310 osd.18 [INF] 5.203 deep-scrub ok
2017-11-28 08:13:38.523380 osd.22 [INF] 5.235 scrub starts
2017-11-28 08:13:38.524181 osd.22 [INF] 5.235 scrub ok
2017-11-28 08:13:39.251064 mon.0 [INF] pgmap v57451: 3736 pgs: 37
activating, 7 activating+remapped, 4 active+undersized+remapped, 50 active,
903 peering, 117 active+remapped, 312 remapped+peering, 625
active+undersized+degraded, 1681 active+clean; 10343 bytes data, 3799 MB
used, 174 TB / 174 TB avail; 25/791 objects degraded (3.161%); 5/791
objects misplaced (0.632%); 620 B/s, 0 keys/s, 58 objects/s recovering
2017-11-28 08:13:36.110274 osd.4 [INF] 8.0 starting backfill to osd.14 from
(0'0,0'0] MAX to 538'49482
2017-11-28 08:13:36.112128 osd.4 [INF] 8.0 starting backfill to osd.23 from
(0'0,0'0] MAX to 538'49482
2017-11-28 08:13:36.127248 osd.4 [INF] 8.0 starting backfill to osd.37 from
(0'0,0'0] MAX to 538'49482
2017-11-28 08:13:40.250559 mon.0 [INF] pgmap v57452: 3736 pgs: 37
activating, 7 activating+remapped, 4 active+undersized+remapped, 52 active,
903 peering, 123 active+remapped, 311 remapped+peering, 590
active+undersized+degraded, 1709 active+clean; 10343 bytes data, 3803 MB
used, 174 TB / 174 TB avail; 25/791 objects degraded (3.161%); 5/791
objects misplaced (0.632%); 77 B/s, 0 keys/s, 14 objects/s recovering
2017-11-28 08:13:36.153569 osd.2 [INF] 8.2 starting backfill to osd.6 from
(0'0,0'0] MAX to 538'49646
2017-11-28 08:13:36.164089 osd.2 [INF] 8.2 starting backfill to osd.34 from
(0'0,0'0] MAX to 538'49646
2017-11-28 08:13:36.217509 osd.10 [INF] 8.1 starting backfill to osd.0 from
(0'0,0'0] MAX to 538'55946
2017-11-28 08:13:36.219512 osd.10 [INF] 8.1 starting backfill to osd.23
from (0'0,0'0] MAX to 538'55946
2017-11-28 08:13:37.806811 osd.10 [INF] 5.318 scrub starts
2017-11-28 08:13:37.807563 osd.10 [INF] 5.318 scrub ok
2017-11-28 08:13:36.235023 osd.45 [INF] 8.4 starting backfill to osd.2 from
(0'0,0'0] MAX to 538'65004
2017-11-28 08:13:36.236576 osd.45 [INF] 8.4 starting backfill to osd.8 from
(0'0,0'0] MAX to 538'65004
2017-11-28 08:13:39.607783 osd.3 [INF] 5.185 scrub starts
2017-11-28 08:13:39.608687 osd.3 [INF] 5.185 scrub ok
2017-11-28 08:13:41.357592 mon.0 [INF] pgmap v57453: 3736 pgs: 37
activating, 7 activating+remapped, 4 active+undersized+remapped, 75 active,
869 peering, 157 active+remapped, 174 remapped+peering, 540
active+undersized+degraded, 1873 active+clean; 10343 bytes data, 3813 MB
used, 174 TB / 174 TB avail; 22/791 objects degraded (2.781%); 5/791
objects misplaced (0.632%); 87 B/s, 23 objects/s recovering
2017-11-28 08:13:42.397617 mon.0 [INF] pgmap v57454: 3736 pgs: 146 active,
3 peering, 338 active+remapped, 540 active+undersized+degraded, 2709
active+clean; 10343 bytes data, 3835 MB used, 174 TB / 174 TB avail; 31/791
objects degraded (3.919%); 14/791 objects misplaced (1.770%); 2765 B/s, 27
keys/s, 56 objects/s recovering
2017-11-28 08:13:37.396991 osd.14 [INF] 5.332 scrub starts
2017-11-28 08:13:37.397496 osd.14 [INF] 5.332 scrub ok
2017-11-28 08:13:42.524505 osd.6 [INF] 3.185 scrub starts
2017-11-28 08:13:42.525389 osd.6 [INF] 3.185 scrub ok
2017-11-28 08:13:43.385342 mon.0 [INF] pgmap v57455: 3736 pgs: 146 active,
338 active+remapped, 540 active+undersized+degraded, 2712 active+clean;
10343 bytes data, 3847 MB used, 174 TB / 174 TB avail; 31/791 objects
degraded (3.919%); 14/791 objects misplaced (1.770%); 2768 B/s, 28 keys/s,
33 objects/s recovering
2017-11-28 08:13:43.397979 osd.14 [INF] 8.0 scrub starts
2017-11-28 08:13:43.401167 osd.14 [INF] 8.0 scrub ok
2017-11-28 08:13:44.392089 mon.0 [INF] pgmap v57456: 3736 pgs: 146 active,
338 active+remapped, 540 active+undersized+degraded, 2712 active+clean;
10343 bytes data, 3848 MB used, 174 TB / 174 TB avail; 31/791 objects
degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:13:45.206293 mon.0 [INF] HEALTH_WARN; 540 pgs degraded; 540
pgs stuck degraded; 1024 pgs stuck unclean; 540 pgs stuck undersized; 540
pgs undersized; recovery 31/791 objects degraded (3.919%); recovery 14/791
objects misplaced (1.770%)
...
2017-11-28 08:14:10.362591 osd.44 [WRN] 1 slow requests, 1 included below;
oldest blocked for > 30.779132 secs
2017-11-28 08:14:10.362600 osd.44 [WRN] slow request 30.779132 seconds old,
received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303
4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no
flag points reached
2017-11-28 08:14:11.579659 mon.0 [INF] pgmap v57474: 3736 pgs: 146 active,
338 active+remapped, 540 active+undersized+degraded, 2712 active+clean;
10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects
degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:14:40.365929 osd.44 [WRN] 1 slow requests, 1 included below;
oldest blocked for > 60.782471 secs
2017-11-28 08:14:40.365934 osd.44 [WRN] slow request 60.782471 seconds old,
received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303
4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no
flag points reached
2017-11-28 08:14:45.207183 mon.0 [INF] HEALTH_WARN; 540 pgs degraded; 540
pgs stuck degraded; 1024 pgs stuck unclean; 540 pgs stuck undersized; 540
pgs undersized; 1 requests are blocked > 32 sec; recovery 31/791 objects
degraded (3.919%); recovery 14/791 objects misplaced (1.770%)
2017-11-28 08:14:46.657287 mon.0 [INF] pgmap v57478: 3736 pgs: 146 active,
338 active+remapped, 540 active+undersized+degraded, 2712 active+clean;
10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects
degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:15:40.372583 osd.44 [WRN] 1 slow requests, 1 included below;
oldest blocked for > 120.789122 secs
2017-11-28 08:15:40.372589 osd.44 [WRN] slow request 120.789122 seconds
old, received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303
4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no
flag points reached
2017-11-28 08:15:56.664417 mon.0 [INF] pgmap v57479: 3736 pgs: 146 active,
338 active+remapped, 540 active+undersized+degraded, 2712 active+clean;
10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects
degraded (3.919%); 14/791 objects misplaced (1.770%)

# NOW CEPH STATUS IS
root at host01:~# ceph status
    cluster a6f73750-1972-47f6-bcf5-a99753be65ad
     health HEALTH_WARN
            540 pgs degraded
            540 pgs stuck degraded
            1024 pgs stuck unclean
            540 pgs stuck undersized
            540 pgs undersized
            1 requests are blocked > 32 sec
            recovery 31/791 objects degraded (3.919%)
            recovery 14/791 objects misplaced (1.770%)
     monmap e2: 3 mons at {host01=
10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
            election epoch 22, quorum 0,1,2 host01,host02,host03
     osdmap e544: 60 osds: 48 up, 48 in; 1024 remapped pgs
            flags sortbitwise,require_jewel_osds
      pgmap v57508: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
            3786 MB used, 174 TB / 174 TB avail
            31/791 objects degraded (3.919%)
            14/791 objects misplaced (1.770%)
                2712 active+clean
                 540 active+undersized+degraded
                 338 active+remapped
                 146 active
root at host01:~#

# IT LOOKS LIKE 338 PGs IN ERASURE CODED POOLS HAVE BEEN REMAPPED
# I DON'T GET WHY 540 PGs ARE STILL IN THE active+undersized+degraded STATE
root at host01:~# ceph pg dump pgs_brief  |grep 'active+remapped'
dumped pgs_brief in format plain
16.6f active+remapped [43,2147483647,2,31,12] 43 [43,33,2,31,12] 43
16.6e active+remapped [10,5,35,44,2147483647] 10 [10,5,35,44,41] 10
....
root at host01:~# egrep '16.6f|16.6e' PGs_on_HOST_host05
16.6f active+clean [43,33,2,59,12] 43 [43,33,2,59,12] 43
16.6e active+clean [10,5,49,35,41] 10 [10,5,49,35,41] 10
root at host01:~#

root at host01:~# ceph pg dump pgs_brief  |grep 'active+undersized+degraded'
dumped pgs_brief in format plain
19.6c active+undersized+degraded [24,20,19,2147483647,46] 24
[24,20,19,2147483647,46] 24
17.6e active+undersized+degraded [19,2147483647,36,31,5] 19
[19,2147483647,36,31,5] 19
...
root at host01:~# egrep '19.6c|17.6e' PGs_on_HOST_host05
19.6c active+clean [24,20,19,58,46] 24 [24,20,19,58,46] 24
17.6e active+clean [19,59,36,31,5] 19 [19,59,36,31,5] 19
root at host01:~#


# POOLS DETAILS
root at host01:~# ceph osd lspools
0 rbd,1 .rgw.root,2 vms,3 images,4 default.rgw.control,5 volumes,6
default.rgw.data.root.old,7 default.rgw.gc,8 default.rgw.log,9
default.rgw.users.uid,10 default.rgw.users.keys,11
default.rgw.users.email,12 default.rgw.buckets.index,13
default.rgw.usage,14 default.rgw.buckets.data.old,15 ecpool_3_2,16
default.rgw.data.root,17 default.rgw.data.root.new01,19
default.rgw.buckets.data,

rbd size: 3 pgp_num: 64
.rgw.root size: 3 pgp_num: 8
vms size: 3 pgp_num: 1024
images size: 3 pgp_num: 512
default.rgw.control size: 3 pgp_num: 8
volumes size: 3 pgp_num: 1024
default.rgw.data.root.old size: 3 pgp_num: 8
default.rgw.gc size: 3 pgp_num: 8
default.rgw.log size: 3 pgp_num: 8
default.rgw.users.uid size: 3 pgp_num: 8
default.rgw.users.keys size: 3 pgp_num: 8
default.rgw.users.email size: 3 pgp_num: 8
default.rgw.buckets.index size: 3 pgp_num: 8
default.rgw.usage size: 3 pgp_num: 8
default.rgw.buckets.data.old size: 3 pgp_num: 8
ecpool_3_2 size: 5 pgp_num: 256
default.rgw.data.root size: 5 pgp_num: 256
default.rgw.data.root.new01 size: 5 pgp_num: 256
default.rgw.buckets.data size: 5 pgp_num: 256

# EC pools use the profile below
root at host01:~# ceph osd erasure-code-profile get ec_profile_k_3_m_2
jerasure-per-chunk-alignment=false
k=3
m=2
plugin=jerasure
ruleset-failure-domain=host
ruleset-root=default
technique=reed_sol_van
w=8
root at host01:~#

# PGs that are in the active+remapped or active+undersized+degraded state
belong to erasure coded pools only
root at host01:~# ceph pg dump pgs_brief  |grep 'active+remapped' |cut -d '.'
-f1 |sort |uniq
dumped pgs_brief in format plain
15
16
17
19
root at host01:~# ceph pg dump pgs_brief  |grep 'active+undersized+degraded'
|cut -d '.' -f1 |sort |uniq
dumped pgs_brief in format plain
15
16
17
19

# FINALLY, CRUSH MAP IS
root at host01:~# cat crushmap.txt
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1

# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
device 6 osd.6
device 7 osd.7
device 8 osd.8
device 9 osd.9
device 10 osd.10
device 11 osd.11
device 12 osd.12
device 13 osd.13
device 14 osd.14
device 15 osd.15
device 16 osd.16
device 17 osd.17
device 18 osd.18
device 19 osd.19
device 20 osd.20
device 21 osd.21
device 22 osd.22
device 23 osd.23
device 24 osd.24
device 25 osd.25
device 26 osd.26
device 27 osd.27
device 28 osd.28
device 29 osd.29
device 30 osd.30
device 31 osd.31
device 32 osd.32
device 33 osd.33
device 34 osd.34
device 35 osd.35
device 36 osd.36
device 37 osd.37
device 38 osd.38
device 39 osd.39
device 40 osd.40
device 41 osd.41
device 42 osd.42
device 43 osd.43
device 44 osd.44
device 45 osd.45
device 46 osd.46
device 47 osd.47
device 48 osd.48
device 49 osd.49
device 50 osd.50
device 51 osd.51
device 52 osd.52
device 53 osd.53
device 54 osd.54
device 55 osd.55
device 56 osd.56
device 57 osd.57
device 58 osd.58
device 59 osd.59

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host host01 {
id -2 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.0 weight 3.636
item osd.3 weight 3.636
item osd.4 weight 3.636
item osd.6 weight 3.636
item osd.8 weight 3.636
item osd.10 weight 3.636
item osd.12 weight 3.636
item osd.14 weight 3.636
item osd.16 weight 3.636
item osd.19 weight 3.636
item osd.22 weight 3.636
item osd.25 weight 3.636
}
rack Rack01 {
id -3 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item host01 weight 43.637
}
host host02 {
id -7 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.1 weight 3.636
item osd.2 weight 3.636
item osd.5 weight 3.636
item osd.7 weight 3.636
item osd.9 weight 3.636
item osd.11 weight 3.636
item osd.13 weight 3.636
item osd.15 weight 3.636
item osd.17 weight 3.636
item osd.20 weight 3.636
item osd.23 weight 3.636
item osd.26 weight 3.636
}
rack Rack02 {
id -8 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item host02 weight 43.637
}
host host03 {
id -9 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.18 weight 3.636
item osd.21 weight 3.636
item osd.24 weight 3.636
item osd.27 weight 3.636
item osd.28 weight 3.636
item osd.29 weight 3.636
item osd.30 weight 3.636
item osd.31 weight 3.636
item osd.32 weight 3.636
item osd.33 weight 3.636
item osd.34 weight 3.636
item osd.35 weight 3.636
}
host host04 {
id -11 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.36 weight 3.636
item osd.37 weight 3.636
item osd.38 weight 3.636
item osd.39 weight 3.636
item osd.40 weight 3.636
item osd.41 weight 3.636
item osd.42 weight 3.636
item osd.43 weight 3.636
item osd.44 weight 3.636
item osd.45 weight 3.636
item osd.46 weight 3.636
item osd.47 weight 3.636
}
host host05 {
id -12 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.48 weight 3.636
item osd.49 weight 3.636
item osd.50 weight 3.636
item osd.51 weight 3.636
item osd.52 weight 3.636
item osd.53 weight 3.636
item osd.54 weight 3.636
item osd.55 weight 3.636
item osd.56 weight 3.636
item osd.57 weight 3.636
item osd.58 weight 3.636
item osd.59 weight 3.636
}
rack Rack03 {
id -10 # do not change unnecessarily
# weight 130.910
alg straw
hash 0 # rjenkins1
item host03 weight 43.637
item host04 weight 43.637
item host05 weight 43.637
}
room MyRoom {
id -4 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item Rack01 weight 43.637
item Rack02 weight 43.637
item Rack03 weight 130.910
}
datacenter MyDC {
id -5 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyRoom weight 218.184
}
region MyRegion {
id -6 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyDC weight 218.184
}
root default {
id -1 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyRegion weight 218.184
}

# rules
rule replicated_ruleset {
ruleset 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
rule ecpool_3_2 {
ruleset 1
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.data.root.new {
ruleset 2
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.data.root.new01 {
ruleset 3
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.buckets.data.new {
ruleset 4
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}

# end crush map
root at host01:~#

Jakub
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.ceph.com/pipermail/ceph-users-ceph.com/attachments/20171128/1fbd722f/attachment.html>


More information about the ceph-users mailing list