Hello
In order to reduce the rebalancing load when removing a disk, we ran a test that
adjusts the reweight value of the OSD.
When the OSD reweight value was set to 0 during the test, an assertion failure occurred
in the MGR whenever the OSD was not in the down state.
(The MGR went down after a restart had been attempted 3 times.)
Regardless of the OSD status, even if the OSD reweight value is set to 0, MGR should not
be affected. Is there anything you would like to share about this?
Below is the output log contents and related code parts.
[Ceph crash info]
------------------------------------------------------------------------------
{
"archived": "2023-07-25 07:57:04.772224",
"assert_condition": "osd_weight.count(oid)",
"assert_file":
"/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc",
"assert_func": "float
OSDMap::calc_deviations(ceph::common::CephContext*, const std::map<int,
std::set<pg_t> >&, const std::map<int, float>&, float,
std::map<int, float>&, std::multimap<float, int>&, float&)",
"assert_line": 5155,
"assert_msg":
"/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc:
In function 'float OSDMap::calc_deviations(ceph::common::CephContext*, const
std::map<int, std::set<pg_t> >&, const std::map<int, float>&,
float, std::map<int, float>&, std::multimap<float, int>&,
float&)' thread 7f56bf420700 time
2023-07-25T07:49:30.155196+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc:
5155: FAILED ceph_assert(osd_weight.count(oid))\n",
"assert_thread_name": "balancer",
"backtrace": [
"/lib64/libpthread.so.0(+0x12ce0) [0x7f570b3c3ce0]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0)
[0x7f570c5a4cd2]",
"/usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7f570c5a4e95]",
"(OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int,
std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >,
std::less<int>, std::allocator<std::pair<int const, std::set<pg_t,
std::less<pg_t>, std::allocator<pg_t> > > > > const&,
std::map<int, float, std::less<int>, std::allocator<std::pair<int const,
float> > > const&, float, std::map<int, float, std::less<int>,
std::allocator<std::pair<int const, float> > >&,
std::multimap<float, int, std::less<float>, std::allocator<std::pair<float
const, int> > >&, float&)+0xe0) [0x7f570ca71490]",
"(OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int,
std::set<long, std::less<long>, std::allocator<long> > const&,
OSDMap::Incremental*, unsigned int*)+0x389) [0x7f570ca75829]",
"/usr/bin/ceph-mgr(+0x299406) [0x555f39368406]",
"/lib64/libpython3.6m.so.1.0(+0x19d0d7) [0x7f570d4d80d7]",
"_PyEval_EvalFrameDefault()",
"/lib64/libpython3.6m.so.1.0(+0xf9984) [0x7f570d434984]",
"/lib64/libpython3.6m.so.1.0(+0x17a030) [0x7f570d4b5030]",
"/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
"_PyEval_EvalFrameDefault()",
"/lib64/libpython3.6m.so.1.0(+0x179e48) [0x7f570d4b4e48]",
"/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
"_PyEval_EvalFrameDefault()",
"/lib64/libpython3.6m.so.1.0(+0x179e48) [0x7f570d4b4e48]",
"/lib64/libpython3.6m.so.1.0(+0x19d377) [0x7f570d4d8377]",
"_PyEval_EvalFrameDefault()",
"/lib64/libpython3.6m.so.1.0(+0xfa2f6) [0x7f570d4352f6]",
"_PyFunction_FastCallDict()",
"_PyObject_FastCallDict()",
"/lib64/libpython3.6m.so.1.0(+0x10db30) [0x7f570d448b30]",
"_PyObject_FastCallDict()",
"PyObject_CallMethod()",
"(PyModuleRunner::serve()+0x66) [0x555f39363e06]",
"(PyModuleRunner::PyModuleRunnerThread::entry()+0x3e3)
[0x555f39365443]",
"/lib64/libpthread.so.0(+0x81ca) [0x7f570b3b91ca]",
"clone()"
],
"ceph_version": "17.2.3",
"crash_id":
"2023-07-25T07:49:30.158977Z_893ea7cd-a5d9-4e57-9ea0-86978b1300cf",
"entity_name": "mgr.eyb-ceph-01.axqecw",
"os_id": "centos",
"os_name": "CentOS Stream",
"os_version": "8",
"os_version_id": "8",
"process_name": "ceph-mgr",
"stack_sig":
"4195797c9f0eefb87056e4b2697fb9367d000d73fc09bdea7f52dbe5b600ccca",
"timestamp": "2023-07-25T07:49:30.158977Z",
"utsname_hostname": "eyb-ceph-01",
"utsname_machine": "x86_64",
"utsname_release": "5.4.0-125-generic",
"utsname_sysname": "Linux",
"utsname_version": "#141-Ubuntu SMP Wed Aug 10 13:42:03 UTC 2022"
}
[MGR Log]
/var/log/ceph/<fsid>/ceph-mgr*.log
------------------------------------------------------------------------------
2023-08-22T11:06:56.402+0000 7fbf43d5e700 0 [balancer INFO root] Optimize plan
auto_2023-08-22_11:06:56
2023-08-22T11:06:56.402+0000 7fbf43d5e700 0 [balancer INFO root] Mode upmap, max
misplaced 0.050000
2023-08-22T11:06:56.402+0000 7fbf43d5e700 0 [balancer INFO root] do_upmap
2023-08-22T11:06:56.402+0000 7fbf43d5e700 0 [balancer INFO root] pools
['rp3pool', 'ec195-4k-pool', 'default.rgw.log',
'ec93pool-rbd', 'suwon.bucket.index', 'default.rgw.buckets.index',
'.rgw.root', 'ec93-non-stripe-pool', 'default.rgw.control',
'.mgr', 'default.rgw.meta', 'default.rgw.otp',
'default.rgw.buckets.non-ec', 'ec195-64k-pool', 'ec195-16k-pool']
2023-08-22T11:06:56.450+0000 7fbf43d5e700 -1
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc:
In function 'float OSDMap::calc_deviations(ceph::common::CephContext*, const
std::map<int, std::set<pg_t> >&, const std::map<int, float>&,
float, std::map<int, float>&, std::multimap<float, int>&,
float&)' thread 7fbf43d5e700 time 2023-08-22T11:06:56.450708+0000
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.3/rpm/el8/BUILD/ceph-17.2.3/src/osd/OSDMap.cc:
5155: FAILED ceph_assert(osd_weight.count(oid))
ceph version 17.2.3 (dff484dfc9e19a9819f375586300b3b79d80034d) quincy (stable)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152)
[0x7fbfadff5c74]
2: /usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7fbfadff5e95]
3: (OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int,
std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >,
std::less<int>, std::allocator<std::pair<int const, std::set<pg_t,
std::less<pg_t>, std::allocator<pg_t> > > > > const&,
std::map<int, float, std::less<int>, std::allocator<std::pair<int const,
float> > > const&, float, std::map<int, float, std::less<int>,
std::allocator<std::pair<int const, float> > >&,
std::multimap<float, int, std::less<float>, std::allocator<std::pair<float
const, int> > >&, float&)+0xe0) [0x7fbfae4c2490]
4: (OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int,
std::set<long, std::less<long>, std::allocator<long> > const&,
OSDMap::Incremental*, unsigned int*)+0x389) [0x7fbfae4c6829]
5: /usr/bin/ceph-mgr(+0x299406) [0x55e50358a406]
6: /lib64/libpython3.6m.so.1.0(+0x19d0d7) [0x7fbfaef290d7]
7: _PyEval_EvalFrameDefault()
8: /lib64/libpython3.6m.so.1.0(+0xf9984) [0x7fbfaee85984]
9: /lib64/libpython3.6m.so.1.0(+0x17a030) [0x7fbfaef06030]
10: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
11: _PyEval_EvalFrameDefault()
12: /lib64/libpython3.6m.so.1.0(+0x179e48) [0x7fbfaef05e48]
13: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
14: _PyEval_EvalFrameDefault()
15: /lib64/libpython3.6m.so.1.0(+0x179e48) [0x7fbfaef05e48]
16: /lib64/libpython3.6m.so.1.0(+0x19d377) [0x7fbfaef29377]
17: _PyEval_EvalFrameDefault()
18: /lib64/libpython3.6m.so.1.0(+0xfa2f6) [0x7fbfaee862f6]
19: _PyFunction_FastCallDict()
20: _PyObject_FastCallDict()
21: /lib64/libpython3.6m.so.1.0(+0x10db30) [0x7fbfaee99b30]
22: _PyObject_FastCallDict()
23: PyObject_CallMethod()
24: (PyModuleRunner::serve()+0x66) [0x55e503585e06]
25: (PyModuleRunner::PyModuleRunnerThread::entry()+0x3e3) [0x55e503587443]
26: /lib64/libpthread.so.0(+0x81ca) [0x7fbface0a1ca]
27: clone()
2023-08-22T11:06:56.450+0000 7fbf43d5e700 -1 *** Caught signal (Aborted) **
in thread 7fbf43d5e700 thread_name:balancer
ceph version 17.2.3 (dff484dfc9e19a9819f375586300b3b79d80034d) quincy (stable)
1: /lib64/libpthread.so.0(+0x12ce0) [0x7fbface14ce0]
2: gsignal()
3: abort()
4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0)
[0x7fbfadff5cd2]
5: /usr/lib64/ceph/libceph-common.so.2(+0x283e95) [0x7fbfadff5e95]
6: (OSDMap::calc_deviations(ceph::common::CephContext*, std::map<int,
std::set<pg_t, std::less<pg_t>, std::allocator<pg_t> >,
std::less<int>, std::allocator<std::pair<int const, std::set<pg_t,
std::less<pg_t>, std::allocator<pg_t> > > > > const&,
std::map<int, float, std::less<int>, std::allocator<std::pair<int const,
float> > > const&, float, std::map<int, float, std::less<int>,
std::allocator<std::pair<int const, float> > >&,
std::multimap<float, int, std::less<float>, std::allocator<std::pair<float
const, int> > >&, float&)+0xe0) [0x7fbfae4c2490]
7: (OSDMap::calc_pg_upmaps(ceph::common::CephContext*, unsigned int, int,
std::set<long, std::less<long>, std::allocator<long> > const&,
OSDMap::Incremental*, unsigned int*)+0x389) [0x7fbfae4c6829]
8: /usr/bin/ceph-mgr(+0x299406) [0x55e50358a406]
[Related code]
OSDMap.cc
------------------------------------------------------------------------------
float OSDMap::calc_deviations (
  CephContext *cct,
  const map<int,set<pg_t>>& pgs_by_osd,
  const map<int,float>& osd_weight,
  float pgs_per_weight,
  map<int,float>& osd_deviation,
  multimap<float,int>& deviation_osd,
  float& stddev)  // return current max deviation
{
  //
  // For every OSD listed in pgs_by_osd, compute the gap between the number of
  // PGs currently mapped to that OSD and its target count, where the target is
  // osd_weight[osd] * pgs_per_weight. Each gap is recorded twice:
  //   osd_deviation : osd id    -> deviation
  //   deviation_osd : deviation -> osd id
  //
  // stddev is reset to zero and then accumulates the sum of the squared
  // deviations. NOTE - this is not a true standard deviation, but it grows and
  // shrinks together with it, which is all the balancer code needs.
  //
  // The return value is the largest absolute deviation seen (0 when
  // pgs_by_osd is empty).
  //
  stddev = 0.0;
  float max_abs_deviation = 0.0;

  for (auto& [oid, pgs] : pgs_by_osd) {
    // every OSD that holds PGs must also have a weight entry, i.e. it must
    // still belong to this crush-tree; a missing entry trips the assertion
    ceph_assert(osd_weight.count(oid));

    const float target = osd_weight.at(oid) * pgs_per_weight;
    const float deviation = static_cast<float>(pgs.size()) - target;
    ldout(cct, 20) << " osd." << oid
                   << "\tpgs " << pgs.size()
                   << "\ttarget " << target
                   << "\tdeviation " << deviation
                   << dendl;

    // keep both lookup directions in sync
    osd_deviation[oid] = deviation;
    deviation_osd.emplace(deviation, oid);

    stddev += deviation * deviation;

    const float magnitude = fabsf(deviation);
    if (magnitude > max_abs_deviation) {
      max_abs_deviation = magnitude;
    }
  }

  return max_abs_deviation;
}