Hello, lists.
I have a 108-OSD Ceph cluster. All OSDs work fine except one: osd.86.
The ceph-osd@86.service unit stops working at random times.
The disk appears healthy according to `smartctl -a`.
After I restart the OSD, it runs fine for a few days and then fails
again.
I have pasted the relevant log below. The OSD stopped at 05:26 UTC.
---
2023-02-17T05:26:37.795+0000 7ff525846700 0 log_channel(cluster) log [DBG]
: 17.df scrub starts
2023-02-17T05:26:37.799+0000 7ff525846700 0 log_channel(cluster) log [DBG]
: 17.df scrub ok
2023-02-17T05:26:38.779+0000 7ff527049700 0 log_channel(cluster) log [DBG]
: 2.64 scrub starts
2023-02-17T05:26:38.783+0000 7ff527049700 0 log_channel(cluster) log [DBG]
: 2.64 scrub ok
2023-02-17T05:26:38.871+0000 7ff526848700 1 osd.86 pg_epoch: 113734
pg[20.115( v 113733'56242916 (113711'56240668,113733'56242916]
local-lis/les=113726/113727 n=1113 ec=440/440 lis/c=113726/113726
les/c/f=113727/113727/0 sis=113734) [105,86,97] r=1 lpr=113734
pi=[113726,113734)/1 luod=0'0 lua=113730'56242903 crt=113733'56242916 lcod
113733'56242915 mlcod 0'0 active mbc={}] start_peering_interval up
[105,86,97] -> [105,86,97], acting [105,97] -> [105,86,97], acting_primary
105 -> 105, up_primary 105 -> 105, role -1 -> 1, features acting
4540138292840890367 upacting 4540138292840890367
2023-02-17T05:26:38.871+0000 7ff526848700 1 osd.86 pg_epoch: 113734
pg[20.115( v 113733'56242916 (113711'56240668,113733'56242916]
local-lis/les=113726/113727 n=1113 ec=440/440 lis/c=113726/113726
les/c/f=113727/113727/0 sis=113734) [105,86,97] r=1 lpr=113734
pi=[113726,113734)/1 crt=113733'56242916 lcod 113733'56242915 mlcod 0'0
unknown NOTIFY mbc={}] state<Start>: transitioning to Stray
2023-02-17T05:26:55.075+0000 7ff52784a700 -1 *** Caught signal
(Segmentation fault) **
in thread 7ff52784a700 thread_name:tp_osd_tp
ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus
(stable)
1: (()+0x14420) [0x7ff54448a420]
2: (BlueStore::ExtentMap::decode_some(ceph::buffer::v15_2_0::list&)+0x31d)
[0x561eeca36ebd]
3: (BlueStore::ExtentMap::fault_range(KeyValueDB*, unsigned int, unsigned
int)+0x241) [0x561eeca3de21]
4: (BlueStore::_do_read(BlueStore::Collection*,
boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long,
ceph::buffer::v15_2_0::list&, unsigned int, unsigned long)+0x153)
[0x561eeca4ae53]
5: (BlueStore::read(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
ghobject_t const&, unsigned long, unsigned long,
ceph::buffer::v15_2_0::list&, unsigned int)+0x233) [0x561eeca4bf63]
6: (ReplicatedBackend::be_deep_scrub(hobject_t const&, ScrubMap&,
ScrubMapBuilder&, ScrubMap::object&)+0x2b5) [0x561eec873235]
7: (PGBackend::be_scan_list(ScrubMap&, ScrubMapBuilder&)+0x35f)
[0x561eec6f2b6f]
8: (PG::build_scrub_map_chunk(ScrubMap&, ScrubMapBuilder&, hobject_t,
hobject_t, bool, ThreadPool::TPHandle&)+0x8b) [0x561eec5aa00b]
9: (PG::chunky_scrub(ThreadPool::TPHandle&)+0x14c8) [0x561eec5bc648]
10: (PG::scrub(unsigned int, ThreadPool::TPHandle&)+0x31b) [0x561eec5be67b]
11: (ceph::osd::scheduler::PGScrub::run(OSD*, OSDShard*,
boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x16) [0x561eec7876b6]
12: (OSD::ShardedOpWQ::_process(unsigned int,
ceph::heartbeat_handle_d*)+0x4db) [0x561eec51724b]
13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x403)
[0x561eecbd5353]
14: (ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x561eecbd8154]
15: (()+0x8609) [0x7ff54447e609]
16: (clone()+0x43) [0x7ff5443a3133]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed
to interpret this.
--- begin dump of recent events ---
-7193> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command assert hook 0x561ef68ea610
-7192> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command abort hook 0x561ef68ea610
-7191> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command leak_some_memory hook 0x561ef68ea610
-7190> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perfcounters_dump hook 0x561ef68ea610
-7189> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command 1 hook 0x561ef68ea610
-7188> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perf dump hook 0x561ef68ea610
-7187> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perfcounters_schema hook 0x561ef68ea610
-7186> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perf histogram dump hook 0x561ef68ea610
-7185> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command 2 hook 0x561ef68ea610
-7184> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perf schema hook 0x561ef68ea610
-7183> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perf histogram schema hook 0x561ef68ea610
-7182> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command perf reset hook 0x561ef68ea610
-7181> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config show hook 0x561ef68ea610
-7180> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config help hook 0x561ef68ea610
-7179> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config set hook 0x561ef68ea610
-7178> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config unset hook 0x561ef68ea610
-7177> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config get hook 0x561ef68ea610
-7176> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config diff hook 0x561ef68ea610
-7175> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command config diff get hook 0x561ef68ea610
-7174> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command injectargs hook 0x561ef68ea610
-7173> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command log flush hook 0x561ef68ea610
-7172> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command log dump hook 0x561ef68ea610
-7171> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command log reopen hook 0x561ef68ea610
-7170> 2023-02-17T05:26:23.928+0000 7ff5440ded80 5 asok(0x561ef6990000)
register_command dump_mempools hook 0x561ef7568068
-7169> 2023-02-17T05:26:23.936+0000 7ff5440ded80 10 monclient:
get_monmap_and_config
-7168> 2023-02-17T05:26:23.936+0000 7ff5440ded80 10 monclient:
build_initial_monmap
-7167> 2023-02-17T05:26:23.936+0000 7ff5440ded80 10 monclient: monmap:
epoch 0
--
-50> 2023-02-17T05:26:44.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:14.771302+0000)
-49> 2023-02-17T05:26:44.771+0000 7ff52103d700 5 osd.86 113735
heartbeat osd_stat(store_statfs(0x35e3cac2000/0x40000000/0x3aac7ffe000,
data 0x45247e7e86/0x450493c000, compress 0x0/0x0/0x0, omap 0x155e40, meta
0x3feaa1c0), peers
[0,9,11,12,13,15,17,18,19,21,22,24,26,27,30,31,33,34,37,39,40,43,46,49,51,55,56,57,60,61,62,64,65,66,67,69,70,71,73,78,79,80,82,83,84,85,87,88,89,91,92,93,94,96,97,100,101,102,103,105,106,107]
op hist [])
-48> 2023-02-17T05:26:45.179+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 550584320 unmapped: 1384448 heap:
551968768 old mem: 2845415832 new mem: 2845415832
-47> 2023-02-17T05:26:45.447+0000 7ff5430b4700 10 monclient:
handle_auth_request added challenge on 0x561f1606f000
-46> 2023-02-17T05:26:45.767+0000 7ff535866700 10 monclient: tick
-45> 2023-02-17T05:26:45.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:15.771503+0000)
-44> 2023-02-17T05:26:46.183+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 550805504 unmapped: 1163264 heap:
551968768 old mem: 2845415832 new mem: 2845415832
-43> 2023-02-17T05:26:46.579+0000 7ff5428b3700 10 monclient:
handle_auth_request added challenge on 0x561f1606ec00
-42> 2023-02-17T05:26:46.579+0000 7ff536868700 2 osd.86 113735
ms_handle_reset con 0x561f1606ec00 session 0x561f166c6f00
-41> 2023-02-17T05:26:46.767+0000 7ff535866700 10 monclient: tick
-40> 2023-02-17T05:26:46.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:16.771672+0000)
-39> 2023-02-17T05:26:47.075+0000 7ff52103d700 5 osd.86 113735
heartbeat osd_stat(store_statfs(0x35e3cac2000/0x40000000/0x3aac7ffe000,
data 0x45247e7e86/0x450493c000, compress 0x0/0x0/0x0, omap 0x155e40, meta
0x3feaa1c0), peers
[0,9,11,12,13,15,17,18,19,21,22,24,26,27,30,31,33,34,37,39,40,43,46,49,51,55,56,57,60,61,62,64,65,66,67,69,70,71,73,78,79,80,82,83,84,85,87,88,89,91,92,93,94,96,97,100,101,102,103,105,106,107]
op hist [])
-38> 2023-02-17T05:26:47.183+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 553959424 unmapped: 1155072 heap:
555114496 old mem: 2845415832 new mem: 2845415832
-37> 2023-02-17T05:26:47.183+0000 7ff537069700 5
bluestore.MempoolThread(0x561ef7616a68) _resize_shards cache_size:
2845415832 kv_alloc: 1140850688 kv_used: 110273280 meta_alloc: 1023410176
meta_used: 2977563 data_alloc: 654311424 data_used: 0
-36> 2023-02-17T05:26:47.575+0000 7ff52103d700 5 osd.86 113735
heartbeat osd_stat(store_statfs(0x35e3cac2000/0x40000000/0x3aac7ffe000,
data 0x45247e7e86/0x450493c000, compress 0x0/0x0/0x0, omap 0x155e40, meta
0x3feaa1c0), peers
[0,9,11,12,13,15,17,18,19,21,22,24,26,27,30,31,33,34,37,39,40,43,46,49,51,55,56,57,60,61,62,64,65,66,67,69,70,71,73,78,79,80,82,83,84,85,87,88,89,91,92,93,94,96,97,100,101,102,103,105,106,107]
op hist [])
-35> 2023-02-17T05:26:47.767+0000 7ff535866700 10 monclient: tick
-34> 2023-02-17T05:26:47.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:17.771928+0000)
-33> 2023-02-17T05:26:48.223+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 557449216 unmapped: 811008 heap:
558260224 old mem: 2845415832 new mem: 2845415832
-32> 2023-02-17T05:26:48.699+0000 7ff5428b3700 10 monclient:
handle_auth_request added challenge on 0x561efca9f000
-31> 2023-02-17T05:26:48.767+0000 7ff535866700 10 monclient: tick
-30> 2023-02-17T05:26:48.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:18.772104+0000)
-29> 2023-02-17T05:26:49.227+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 560799744 unmapped: 606208 heap:
561405952 old mem: 2845415832 new mem: 2845415832
-28> 2023-02-17T05:26:49.275+0000 7ff52103d700 5 osd.86 113735
heartbeat osd_stat(store_statfs(0x35e3cac0000/0x40000000/0x3aac7ffe000,
data 0x45247e7e86/0x450493c000, compress 0x0/0x0/0x0, omap 0x155e40, meta
0x3feaa1c0), peers
[0,9,11,12,13,15,17,18,19,21,22,24,26,27,30,31,33,34,37,39,40,43,46,49,51,55,56,57,60,61,62,64,65,66,67,69,70,71,73,78,79,80,82,83,84,85,87,88,89,91,92,93,94,96,97,100,101,102,103,105,106,107]
op hist [])
-27> 2023-02-17T05:26:49.367+0000 7ff5438b5700 10 monclient:
handle_auth_request added challenge on 0x561f13c69000
-26> 2023-02-17T05:26:49.767+0000 7ff535866700 10 monclient: tick
-25> 2023-02-17T05:26:49.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:19.772303+0000)
-24> 2023-02-17T05:26:50.231+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 565821440 unmapped: 827392 heap:
566648832 old mem: 2845415832 new mem: 2845415832
-23> 2023-02-17T05:26:50.295+0000 7ff5430b4700 10 monclient:
handle_auth_request added challenge on 0x561efca9f400
-22> 2023-02-17T05:26:50.767+0000 7ff535866700 10 monclient: tick
-21> 2023-02-17T05:26:50.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:20.772449+0000)
-20> 2023-02-17T05:26:51.231+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 570171392 unmapped: 671744 heap:
570843136 old mem: 2845415832 new mem: 2845415832
-19> 2023-02-17T05:26:51.767+0000 7ff535866700 10 monclient: tick
-18> 2023-02-17T05:26:51.767+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:21.772614+0000)
-17> 2023-02-17T05:26:51.803+0000 7ff5428b3700 10 monclient:
handle_auth_request added challenge on 0x561f13c68400
-16> 2023-02-17T05:26:52.123+0000 7ff53205f700 5
bluestore(/var/lib/ceph/osd/ceph-86) _kv_sync_thread utilization: idle
9.937826090s of 10.006035168s, submitted: 179
-15> 2023-02-17T05:26:52.183+0000 7ff537069700 5
bluestore.MempoolThread(0x561ef7616a68) _resize_shards cache_size:
2845415832 kv_alloc: 1140850688 kv_used: 113276944 meta_alloc: 1040187392
meta_used: 16149983 data_alloc: 654311424 data_used: 0
-14> 2023-02-17T05:26:52.247+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 574758912 unmapped: 278528 heap:
575037440 old mem: 2845415832 new mem: 2845415832
-13> 2023-02-17T05:26:52.343+0000 7ff5438b5700 10 monclient:
handle_auth_request added challenge on 0x561f13c68000
-12> 2023-02-17T05:26:52.539+0000 7ff5430b4700 10 monclient:
handle_auth_request added challenge on 0x561f167b4800
-11> 2023-02-17T05:26:52.555+0000 7ff5428b3700 10 monclient:
handle_auth_request added challenge on 0x561f14eb4800
-10> 2023-02-17T05:26:52.771+0000 7ff535866700 10 monclient: tick
-9> 2023-02-17T05:26:52.771+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:22.772842+0000)
-8> 2023-02-17T05:26:52.775+0000 7ff52103d700 5 osd.86 113735
heartbeat osd_stat(store_statfs(0x35e3cabe000/0x40000000/0x3aac7ffe000,
data 0x45247e7e86/0x450493c000, compress 0x0/0x0/0x0, omap 0x155e40, meta
0x3feaa1c0), peers
[0,9,11,12,13,15,17,18,19,21,22,24,26,27,30,31,33,34,37,39,40,43,46,49,51,55,56,57,60,61,62,64,65,66,67,69,70,71,73,78,79,80,82,83,84,85,87,88,89,91,92,93,94,96,97,100,101,102,103,105,106,107]
op hist [])
-7> 2023-02-17T05:26:53.247+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 579764224 unmapped: 516096 heap:
580280320 old mem: 2845415832 new mem: 2845415832
-6> 2023-02-17T05:26:53.531+0000 7ff5438b5700 10 monclient:
handle_auth_request added challenge on 0x561f14eb4000
-5> 2023-02-17T05:26:53.771+0000 7ff535866700 10 monclient: tick
-4> 2023-02-17T05:26:53.771+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:23.773042+0000)
-3> 2023-02-17T05:26:54.251+0000 7ff537069700 5 prioritycache
tune_memory target: 4294967296 mapped: 583467008 unmapped: 1007616 heap:
584474624 old mem: 2845415832 new mem: 2845415832
-2> 2023-02-17T05:26:54.771+0000 7ff535866700 10 monclient: tick
-1> 2023-02-17T05:26:54.771+0000 7ff535866700 10 monclient:
_check_auth_rotating have uptodate secrets (they expire after
2023-02-17T05:26:24.773241+0000)
0> 2023-02-17T05:26:55.075+0000 7ff52784a700 -1 *** Caught signal
(Segmentation fault) **
in thread 7ff52784a700 thread_name:tp_osd_tp
ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus
(stable)
1: (()+0x14420) [0x7ff54448a420]
2: (BlueStore::ExtentMap::decode_some(ceph::buffer::v15_2_0::list&)+0x31d)
[0x561eeca36ebd]
3: (BlueStore::ExtentMap::fault_range(KeyValueDB*, unsigned int, unsigned
int)+0x241) [0x561eeca3de21]
4: (BlueStore::_do_read(BlueStore::Collection*,
boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long,
ceph::buffer::v15_2_0::list&, unsigned int, unsigned long)+0x153)
[0x561eeca4ae53]
5: (BlueStore::read(boost::intrusive_ptr<ObjectStore::CollectionImpl>&,
ghobject_t const&, unsigned long, unsigned long,
ceph::buffer::v15_2_0::list&, unsigned int)+0x233) [0x561eeca4bf63]
6: (ReplicatedBackend::be_deep_scrub(hobject_t const&, ScrubMap&,
ScrubMapBuilder&, ScrubMap::object&)+0x2b5) [0x561eec873235]
7: (PGBackend::be_scan_list(ScrubMap&, ScrubMapBuilder&)+0x35f)
[0x561eec6f2b6f]
8: (PG::build_scrub_map_chunk(ScrubMap&, ScrubMapBuilder&, hobject_t,
hobject_t, bool, ThreadPool::TPHandle&)+0x8b) [0x561eec5aa00b]
9: (PG::chunky_scrub(ThreadPool::TPHandle&)+0x14c8) [0x561eec5bc648]
10: (PG::scrub(unsigned int, ThreadPool::TPHandle&)+0x31b) [0x561eec5be67b]
11: (ceph::osd::scheduler::PGScrub::run(OSD*, OSDShard*,
boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x16) [0x561eec7876b6]
12: (OSD::ShardedOpWQ::_process(unsigned int,
ceph::heartbeat_handle_d*)+0x4db) [0x561eec51724b]
13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x403)
[0x561eecbd5353]
14: (ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x561eecbd8154]
15: (()+0x8609) [0x7ff54447e609]
16: (clone()+0x43) [0x7ff5443a3133]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed
to interpret this.
--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
1/ 5 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 rbd_rwl
0/ 5 journaler
0/ 5 objectcacher
0/ 5 immutable_obj_cache
0/ 5 client
1/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore