Hi Luke,
This is highly likely caused by the issue covered in
https://tracker.ceph.com/issues/53906
Unfortunately, it looks like we missed a proper backport to Pacific.
You can apparently work around the issue by setting the
'bluestore_volume_selection_policy' config parameter to rocksdb_original.
The potential implication of that "tuning" is less effective free
space usage on the DB volume: RocksDB/BlueFS might initiate data
spillover to the main (slow) device despite free space being available
on the standalone DB volume, which in turn might cause some performance
regression.
A relevant alert will pop up if such a spillover takes place.
The above consequences are not very likely to occur, though, and they
are rather minor most of the time, so I would encourage you to try that
if OSD crashes are that common.
Thanks,
Igor
On 21/09/2023 17:48, Luke Hall wrote:
> Hi,
>
> Since the recent update to 16.2.14-1~bpo11+1 on Debian Bullseye I've
> started seeing OSD crashes being registered almost daily across all
> six physical machines (6xOSD disks per machine). There's a --block-db
> for each osd on a LV from an NVMe.
>
> If anyone has any idea what might be causing these I'd appreciate some
> insight. Happy to provide any other info which might be useful.
>
> Thanks,
>
> Luke
>
>
>
> {
> "assert_condition": "cur2 >= p.length",
> "assert_file": "./src/os/bluestore/BlueStore.h",
> "assert_func": "virtual void
> RocksDBBlueFSVolumeSelector::sub_usage(void*, const bluefs_fnode_t&)",
> "assert_line": 3875,
> "assert_msg": "./src/os/bluestore/BlueStore.h: In function
> 'virtual void RocksDBBlueFSVolumeSelector::sub_usage(void*, const
> bluefs_fnode_t&)' thread 7f7f54f25700 time
> 2023-09-20T14:24:00.455721+0100\n./src/os/bluestore/BlueStore.h: 3875:
> FAILED ceph_assert(cur2 >= p.length)\n",
> "assert_thread_name": "bstore_kv_sync",
> "backtrace": [
> "/lib/x86_64-linux-gnu/libpthread.so.0(+0x13140)
> [0x7f7f68632140]",
> "gsignal()",
> "abort()",
> "(ceph::__ceph_assert_fail(char const*, char const*, int, char
> const*)+0x16e) [0x55b22a49b5fa]",
> "/usr/bin/ceph-osd(+0xac673b) [0x55b22a49b73b]",
> "(RocksDBBlueFSVolumeSelector::sub_usage(void*, bluefs_fnode_t
> const&)+0x11e) [0x55b22ab0077e]",
> "(BlueFS::_flush_range_F(BlueFS::FileWriter*, unsigned long,
> unsigned long)+0x5bd) [0x55b22ab9b8ed]",
> "(BlueFS::_flush_F(BlueFS::FileWriter*, bool, bool*)+0x9a)
> [0x55b22ab9bd7a]",
> "(BlueFS::fsync(BlueFS::FileWriter*)+0x79) [0x55b22aba97a9]",
> "(BlueRocksWritableFile::Sync()+0x15) [0x55b22abbf405]",
> "(rocksdb::LegacyWritableFileWrapper::Sync(rocksdb::IOOptions const&,
> rocksdb::IODebugContext*)+0x3f) [0x55b22b0914d1]",
> "(rocksdb::WritableFileWriter::SyncInternal(bool)+0x1f4)
> [0x55b22b26b7c6]",
> "(rocksdb::WritableFileWriter::Sync(bool)+0x18c)
> [0x55b22b26b1f8]",
> "(rocksdb::DBImpl::WriteToWAL(rocksdb::WriteThread::WriteGroup const&,
> rocksdb::log::Writer*, unsigned long*, bool, bool, unsigned
> long)+0x366) [0x55b22b0e4a98]",
> "(rocksdb::DBImpl::WriteImpl(rocksdb::WriteOptions const&,
> rocksdb::WriteBatch*, rocksdb::WriteCallback*, unsigned long*,
> unsigned long, bool, unsigned long*, unsigned long,
> rocksdb::PreReleaseCallback*)+0x12cc) [0x55b22b0e0c5a]",
> "(rocksdb::DBImpl::Write(rocksdb::WriteOptions const&,
> rocksdb::WriteBatch*)+0x4a) [0x55b22b0df92a]",
> "(RocksDBStore::submit_common(rocksdb::WriteOptions&,
> std::shared_ptr<KeyValueDB::TransactionImpl>)+0x82) [0x55b22b036c42]",
>
>
> "(RocksDBStore::submit_transaction_sync(std::shared_ptr<KeyValueDB::TransactionImpl>)+0x96)
> [0x55b22b037cc6]",
> "(BlueStore::_kv_sync_thread()+0x1201) [0x55b22aafc891]",
> "(BlueStore::KVSyncThread::entry()+0xd) [0x55b22ab2792d]",
> "/lib/x86_64-linux-gnu/libpthread.so.0(+0x7ea7)
> [0x7f7f68626ea7]",
> "clone()"
> ],
> "ceph_version": "16.2.14",
> "crash_id":
> "2023-09-20T13:24:00.562318Z_beb5c664-9ffb-4a4e-8c61-166865fd4e0b",
> "entity_name": "osd.8",
> "os_id": "11",
> "os_name": "Debian GNU/Linux 11 (bullseye)",
> "os_version": "11 (bullseye)",
> "os_version_id": "11",
> "process_name": "ceph-osd",
> "stack_sig":
> "90d1fb6954f0f5b1e98659a93a1b9ce5a5a42cd5e0b2990a65dc336567adcb26",
> "timestamp": "2023-09-20T13:24:00.562318Z",
> "utsname_hostname": "cphosd02",
> "utsname_machine": "x86_64",
> "utsname_release": "5.10.0-23-amd64",
> "utsname_sysname": "Linux",
> "utsname_version": "#1 SMP Debian 5.10.179-1 (2023-05-12)"
> }
>
>