The backfilling was caused by decommissioning an old host and moving a
bunch of OSDs to new machines.
The balancer has not been activated since the backfill started / since the
OSDs were moved between hosts.
Busy OSD level? Do you mean fullness? The cluster is relatively unused in
terms of busyness.
# ceph status
  cluster:
    health: HEALTH_WARN
            noout flag(s) set
            Low space hindering backfill (add storage if this doesn't resolve itself): 10 pgs backfill_toofull

  services:
    mon: 4 daemons, quorum ceph-server-02,ceph-server-04,ceph-server-01,ceph-server-05 (age 6d)
    mgr: ceph-server-01.gfavjb(active, since 6d), standbys: ceph-server-05.swmxto, ceph-server-04.ymoarr, ceph-server-02.zzcppv
    mds: 1/1 daemons up, 3 standby
    osd: 44 osds: 44 up (since 6d), 44 in (since 6d); 19 remapped pgs
         flags noout

  data:
    volumes: 1/1 healthy
    pools:   9 pools, 481 pgs
    objects: 57.41M objects, 222 TiB
    usage:   351 TiB used, 129 TiB / 480 TiB avail
    pgs:     13895113/514097636 objects misplaced (2.703%)
             455 active+clean
             10  active+remapped+backfill_toofull
             9   active+remapped+backfilling
             5   active+clean+scrubbing+deep
             2   active+clean+scrubbing

  io:
    client: 7.5 MiB/s rd, 4.8 KiB/s wr, 28 op/s rd, 1 op/s wr
# ceph osd df | sort -rnk 17
ID  CLASS  WEIGHT    REWEIGHT  SIZE     RAW USE  DATA     OMAP     META      AVAIL    %USE   VAR   PGS  STATUS
 0  hdd     9.09598   1.00000  9.1 TiB  6.0 TiB  6.0 TiB      0 B  18 GiB    3.1 TiB  65.96  0.90   62  up
11  hdd    10.91423   1.00000   11 TiB  7.0 TiB  7.0 TiB   40 MiB  18 GiB    3.9 TiB  64.26  0.88   70  up
43  hdd    14.55269   1.00000   15 TiB  9.3 TiB  9.3 TiB  117 MiB  24 GiB    5.3 TiB  63.92  0.87   87  up
26  hdd    12.73340   1.00000   13 TiB  7.9 TiB  7.9 TiB   54 MiB  21 GiB    4.8 TiB  61.98  0.85   80  up
35  hdd    14.55269   1.00000   15 TiB  8.9 TiB  8.9 TiB   46 MiB  25 GiB    5.7 TiB  61.05  0.83   87  up
 5  hdd     9.09569   1.00000  9.1 TiB  5.5 TiB  5.5 TiB    1 KiB  15 GiB    3.6 TiB  60.71  0.83   54  up
                      TOTAL    480 TiB  351 TiB  350 TiB  2.6 GiB  1018 GiB  129 TiB  73.12
# ceph balancer status
{
    "active": true,
    "last_optimize_duration": "0:00:00.000326",
    "last_optimize_started": "Wed Mar 27 09:04:32 2024",
    "mode": "upmap",
    "no_optimization_needed": false,
    "optimize_result": "Too many objects (0.027028 > 0.010000) are misplaced; try again later",
    "plans": []
}
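As a sanity check, the ratio in that optimize_result matches the misplaced object counts in the ceph status output above (the 0.010000 threshold is presumably the balancer's target_max_misplaced_ratio; the sketch below just reproduces the arithmetic):

```python
# Cross-check the balancer's reported misplaced ratio against the
# object counts from `ceph status` (13895113/514097636 misplaced).
misplaced_objects = 13_895_113
total_object_copies = 514_097_636

ratio = misplaced_objects / total_object_copies
print(f"{ratio:.6f}")  # 0.027028, matching optimize_result

# The balancer will not generate new upmaps until this drops below the
# (apparently configured) target_max_misplaced_ratio of 0.010000.
```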
On Wed, Mar 27, 2024 at 4:53 PM David C. <david.casier(a)aevoo.fr> wrote:
Hi Daniel,
Changing pg_num while some OSDs are almost full is not a good strategy (and
can even be dangerous).
What is causing this backfilling? Loss of an OSD? The balancer? Something else?
What is the fullness level of the least-busy OSD? (sort -nrk17)
Is the balancer activated? (upmap?)
Once the situation stabilizes, it becomes worth thinking about the
number of PGs per OSD =>
https://docs.ceph.com/en/latest/rados/operations/placement-groups/#managing…
On Wed, Mar 27, 2024 at 09:41, Daniel Williams <danielwoz(a)gmail.com>
wrote:
Hey,
I'm running ceph version 18.2.1 (reef), but this problem must have existed
a long time before reef.
The documentation says the autoscaler will target 100 PGs per OSD, but I'm
only seeing ~10. My erasure coding is a stripe of 6 data + 3 parity.
Could that be the reason? Are PG counts for an EC pool effectively
multiplied by k+m in the autoscaler's calculations?
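A rough back-of-the-envelope on that question, assuming the PGS column in `ceph osd df` counts per-OSD shard placements (pool pg_num and size values are from the `ceph osd pool ls detail` output further down):

```python
# Rough estimate of PG placements per OSD: each PG places `size` shards
# (replica count for replicated pools, k+m for EC pools), so the per-OSD
# figure is sum(pg_num * size) / num_osds.
pools = [
    # (pg_num, size)
    (32, 3),    # .rgw.root
    (32, 3),    # default.rgw.control
    (32, 3),    # default.rgw.meta
    (32, 3),    # default.rgw.log
    (256, 9),   # storagefs (EC 6+3)
    (32, 4),    # storagefs-meta
    (32, 11),   # storagefs_wide (EC 8+3)
    (1, 3),     # .mgr
    (32, 3),    # mgr-backup-2022-08-19
]
num_osds = 44

placements = sum(pg_num * size for pg_num, size in pools)
print(placements, placements / num_osds)  # 3267 placements, ~74 per OSD
```

~74 placements per OSD lands right in the 54-87 range the PGS column shows in the ceph osd df output, so per-OSD load is already much closer to the 100 target than the nominal pg_num suggests.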
Is backfill_toofull calculated from the total size of the PG against
every OSD it is destined for? In my case I have ~1 TiB PGs because the
autoscaler is creating only ~10 per OSD, and backfill_toofull seems to be
triggered because one of my OSDs only has ~500 GiB free. That doesn't
quite add up either, because two ~1 TiB PGs that have OSD 1 in them are
currently backfilling. My backfill full ratio is set to 97%.
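A rough sketch of the sizes involved (I'm not certain of the exact reservation accounting backfill_toofull uses, so this is only illustrative): with EC k=6 each OSD stores about BYTES/k of a PG, not the full ~1 TiB, and OSD 1's headroom to the 0.97 backfillfull line is:

```python
# For EC k=6,m=3, each OSD holds one shard of a PG, i.e. roughly
# BYTES / k (plus metadata overhead), not the full ~1 TiB.
pg_bytes = 953_817_797_727   # pg 36.f from the `ceph pg ls` output below
k = 6
shard_gib = pg_bytes / k / 2**30
print(f"{shard_gib:.0f} GiB per shard")  # ~148 GiB

# OSD 1 (7.27698 TiB, 93.94% used): space left before the 0.97
# backfillfull ratio set in ceph.conf below.
osd1_gib = 7.27698 * 1024
headroom_gib = (0.97 - 0.9394) * osd1_gib
print(f"{headroom_gib:.0f} GiB headroom")  # ~228 GiB
```

So one ~148 GiB shard would fit, but two or three concurrent backfill reservations plausibly would not, which might explain the toofull flag despite the apparent free space.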
Would it be correct for me to change the autoscaler to target ~700 PGs per
OSD, and set the bias for storagefs and all EC pools to k+m? Should that
be the default, or the documentation's recommended value?
How scary is changing pg_num while backfilling misplaced PGs? It seems
like there's a chance the backfill might succeed, so I think I can wait.
Any help is greatly appreciated. I've tried to include as much of the
relevant debugging output as I can think of.
Daniel
# ceph osd ls | wc -l
44
# ceph pg ls | wc -l
484
# ceph osd pool autoscale-status
POOL                   SIZE    TARGET SIZE  RATE   RAW CAPACITY  RATIO   TARGET RATIO  EFFECTIVE RATIO  BIAS  PG_NUM  NEW PG_NUM  AUTOSCALE  BULK
.rgw.root              216.0k               3.0    480.2T        0.0000                                 1.0   32                  on         False
default.rgw.control    0                    3.0    480.2T        0.0000                                 1.0   32                  on         False
default.rgw.meta       0                    3.0    480.2T        0.0000                                 1.0   32                  on         False
default.rgw.log        1636k                3.0    480.2T        0.0000                                 1.0   32                  on         False
storagefs              233.5T               1.5    480.2T        0.7294                                 1.0   256                 on         False
storagefs-meta         850.2M               4.0    480.2T        0.0000                                 4.0   32                  on         False
storagefs_wide         355.3G               1.375  480.2T        0.0010                                 1.0   32                  on         False
.mgr                   457.3M               3.0    480.2T        0.0000                                 1.0   1                   on         False
mgr-backup-2022-08-19  370.6M               3.0    480.2T        0.0000                                 1.0   32                  on         False
# ceph osd pool ls detail | column -t
pool  15  '.rgw.root'              replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  16  'default.rgw.control'    replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  17  'default.rgw.meta'       replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  18  'default.rgw.log'        replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  36  'storagefs'              erasure     profile  6.3   size  9   min_size  7   crush_rule  2  object_hash  rjenkins    pg_num  256  pgp_num  256  autoscale_mode  on
pool  37  'storagefs-meta'         replicated  size  4        min_size  1   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  45  'storagefs_wide'         erasure     profile  8.3   size  11  min_size  9   crush_rule  8  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
pool  46  '.mgr'                   replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  1    pgp_num  1    autoscale_mode  on
pool  48  'mgr-backup-2022-08-19'  replicated  size  3        min_size  2   crush_rule  0  object_hash  rjenkins    pg_num  32   pgp_num  32   autoscale_mode  on
# ceph osd erasure-code-profile get 6.3
crush-device-class=
crush-failure-domain=host
crush-root=default
jerasure-per-chunk-alignment=false
k=6
m=3
plugin=jerasure
technique=reed_sol_van
w=8
# ceph pg ls | awk 'NR==1 || /backfill_toofull/' | awk '{print $1" "$2" "$4" "$6" "$11" "$15" "$16}' | column -t
PG     OBJECTS  MISPLACED  BYTES         STATE                             UP                              ACTING
36.f   222077   141392     953817797727  active+remapped+backfill_toofull  [1,27,41,8,36,17,14,40,32]p1    [33,32,29,23,16,17,28,1,14]p33
36.5c  221761   147015     950692130045  active+remapped+backfill_toofull  [26,27,40,29,1,37,39,11,42]p26  [12,24,4,2,31,25,17,33,8]p12
36.60  222710   0          957109050809  active+remapped+backfill_toofull  [41,34,22,3,1,35,9,39,29]p41    [2,34,22,3,27,32,28,24,1]p2
36.6b  222202   427168     953843892012  active+remapped+backfill_toofull  [20,15,7,21,37,1,38,17,32]p20   [7,2,32,26,5,35,24,17,23]p7
36.74  222681   777546     957679960067  active+remapped+backfill_toofull  [42,24,12,34,38,10,27,1,25]p42  [34,33,12,0,19,14,17,30,25]p34
36.7b  222974   1560818    957691042940  active+remapped+backfill_toofull  [2,35,27,1,20,18,19,12,8]p2     [31,23,21,24,35,18,19,33,25]p31
36.82  222362   1998670    954507657022  active+remapped+backfill_toofull  [37,22,1,38,11,23,27,32,33]p37  [27,33,0,32,5,25,20,13,15]p27
36.b5  221676   1330056    953443725830  active+remapped+backfill_toofull  [6,8,38,12,21,1,39,34,27]p6     [33,8,26,12,3,10,22,34,1]p33
36.b6  222669   1335327    956973704883  active+remapped+backfill_toofull  [11,13,41,4,12,34,29,6,1]p11    [2,29,34,4,12,9,15,6,28]p2
36.e0  221518   1772144    952581426388  active+remapped+backfill_toofull  [1,27,21,31,30,23,37,13,28]p1   [25,21,14,31,1,2,34,17,24]p25
# ceph pg ls | awk 'NR==1 || /backfilling/' | grep -e BYTES -e '\[1' -e ',1,' -e '1\]' | awk '{print $1" "$2" "$4" "$6" "$11" "$15" "$16}' | column -t
PG     OBJECTS  MISPLACED  BYTES         STATE                        UP                              ACTING
36.4a  221508   89144      951346455917  active+remapped+backfilling  [40,43,33,32,30,38,22,35,9]p40  [27,10,20,7,30,21,1,28,31]p27
36.79  222315   1111575    955797107713  active+remapped+backfilling  [1,36,31,33,25,23,14,3,13]p1    [27,6,31,23,25,5,14,29,13]p27
36.8d  222229   1284156    955234423342  active+remapped+backfilling  [35,34,27,37,38,36,43,3,16]p35  [35,34,15,26,1,11,27,18,16]p35
36.ba  222039   0          952547107971  active+remapped+backfilling  [0,40,33,23,41,4,27,22,28]p0    [0,35,33,27,1,3,30,22,28]p0
36.da  221607   277464     951599928383  active+remapped+backfilling  [21,31,8,9,11,25,36,23,28]p21   [0,10,1,22,33,11,35,15,28]p0
36.db  221685   58816      951420054091  active+remapped+backfilling  [3,28,12,13,1,38,40,35,43]p3    [27,20,17,21,1,23,28,24,31]p27
# ceph osd df | sort -nk 17 | tail -n 5
21  hdd  9.09598  1.00000  9.1 TiB  7.7 TiB  7.7 TiB     0 B  31 GiB   1.4 TiB  84.62  1.16  68  up
24  hdd  9.09598  1.00000  9.1 TiB  7.7 TiB  7.7 TiB   1 KiB  25 GiB   1.4 TiB  84.98  1.16  69  up
29  hdd  9.09569  1.00000  9.1 TiB  8.0 TiB  8.0 TiB  72 MiB  23 GiB   1.1 TiB  88.42  1.21  73  up
13  hdd  9.09569  1.00000  9.1 TiB  8.1 TiB  8.1 TiB   1 KiB  22 GiB  1023 GiB  89.02  1.22  76  up
 1  hdd  7.27698  1.00000  7.3 TiB  6.8 TiB  6.8 TiB  27 MiB  18 GiB   451 GiB  93.94  1.28  64  up
# cat /etc/ceph/ceph.conf | grep full
mon_osd_full_ratio = .98
mon_osd_nearfull_ratio = .96
mon_osd_backfillfull_ratio = .97
osd_backfill_full_ratio = .97
osd_failsafe_full_ratio = .99
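One small check on those values (a sketch; as far as I know Ceph expects the thresholds to be ascending, nearfull < backfillfull < full < failsafe, and raises an OSD_OUT_OF_ORDER_FULL health warning otherwise):

```python
# The ratios from ceph.conf above, in the order Ceph expects them to
# ascend; out-of-order values would make OSDs refuse backfills before
# they ever report nearfull.
ratios = [
    ("mon_osd_nearfull_ratio", 0.96),
    ("mon_osd_backfillfull_ratio", 0.97),
    ("mon_osd_full_ratio", 0.98),
    ("osd_failsafe_full_ratio", 0.99),
]
values = [v for _, v in ratios]
assert values == sorted(values), "full ratios out of order"
print("ratio ordering OK")
```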
_______________________________________________
ceph-users mailing list -- ceph-users(a)ceph.io
To unsubscribe send an email to ceph-users-leave(a)ceph.io