Hello Ceph experts,
In the last day or so, we had a few nodes randomly reboot, and now unfound objects are
reported in Ceph health during cluster recovery.
It appears that the object in question is a hit set object, which I now cannot mark lost
because Ceph cannot probe the OSDs, which keep crashing due to the missing hit set object.
Pasted below is the crash message[1] for osd.299, and some of the unfound objects[2].
Lastly [3] shows a sample of the hit set objects that are lost.
I would greatly appreciate any insight you may have on how to move forward. As of right
now this cluster is inoperable due to 3 down PGs.
Thanks,
Lincoln Bryant
[1]
-4> 2020-02-26 22:26:29.455 7ff52edaa700 0 0x559587fa91e0 36.321b unexpected need
for 36:d84c0000:.ceph-internal::hit_set_36.321b_archive_2020-02-24
21%3a15%3a16.792846_2020-02-24 21%3a15%3a32.457855:head have 1352209'2834660 flags =
none tried to add 1352209'2834660 flags = none
-3> 2020-02-26 22:26:29.455 7ff52edaa700 0 0x559587fa91e0 36.321b unexpected need
for 36:d84c0000:.ceph-internal::hit_set_36.321b_archive_2020-02-24
21%3a15%3a16.792846_2020-02-24 21%3a15%3a32.457855:head have 1352209'2834660 flags =
none tried to add 1359781'2835659 flags = delete
-2> 2020-02-26 22:26:29.456 7ff53adc2700 3 osd.299 1367392 handle_osd_map epochs
[1367392,1367392], i have 1367392, src has [1349017,1367392]
-1> 2020-02-26 22:26:29.460 7ff52edaa700 -1
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.7/rpm/el7/BUILD/ceph-14.2.7/src/osd/PG.h:
In function 'void PG::MissingLoc::add_active_missing(const pg_missing_t&)'
thread 7ff52edaa700 time 2020-02-26 22:26:29.457170
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.7/rpm/el7/BUILD/ceph-14.2.7/src/osd/PG.h:
838: FAILED ceph_assert(i->second.need == j->second.need)
ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus (stable)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x14a)
[0x55955fdafc0f]
2: (()+0x4dddd7) [0x55955fdafdd7]
3: (PG::MissingLoc::add_active_missing(pg_missing_set<false> const&)+0x1e0)
[0x55955ffa0cb0]
4: (PG::activate(ObjectStore::Transaction&, unsigned int, std::map<int,
std::map<spg_t, pg_query_t, std::less<spg_t>,
std::allocator<std::pair<spg_t const, pg_query_t> > >,
std::less<int>, std::allocator<std::pair<int const, std::map<spg_t,
pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t const,
pg_query_t> > > > > >&, std::map<int,
std::vector<std::pair<pg_notify_t, PastIntervals>,
std::allocator<std::pair<pg_notify_t, PastIntervals> > >,
std::less<int>, std::allocator<std::pair<int const,
std::vector<std::pair<pg_notify_t, PastIntervals>,
std::allocator<std::pair<pg_notify_t, PastIntervals> > > > > >*,
PG::RecoveryCtx*)+0x1916) [0x55955ff3f1e6]
5:
(PG::RecoveryState::Active::Active(boost::statechart::state<PG::RecoveryState::Active,
PG::RecoveryState::Primary, PG::RecoveryState::Activating,
(boost::statechart::history_mode)0>::my_context)+0x370) [0x55955ff62d20]
6: (boost::statechart::simple_state<PG::RecoveryState::Peering,
PG::RecoveryState::Primary, PG::RecoveryState::GetInfo,
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
const&, void const*)+0xfb) [0x55955ffa8d5b]
7: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine,
PG::RecoveryState::Initial, std::allocator<void>,
boost::statechart::null_exception_translator>::process_queued_events()+0x97)
[0x55955ff88507]
8: (PG::handle_activate_map(PG::RecoveryCtx*)+0x1a8) [0x55955ff75848]
9: (OSD::advance_pg(unsigned int, PG*, ThreadPool::TPHandle&,
PG::RecoveryCtx*)+0x61d) [0x55955feb161d]
10: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>,
ThreadPool::TPHandle&)+0xa6) [0x55955feb2d16]
11: (PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&,
ThreadPool::TPHandle&)+0x51) [0x55956011a481]
12: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f)
[0x55955fea7bbf]
13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x559560448976]
14: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55956044b490]
15: (()+0x7e25) [0x7ff5669bae25]
16: (clone()+0x6d) [0x7ff565a9a34d]
0> 2020-02-26 22:26:29.465 7ff52edaa700 -1 *** Caught signal (Aborted) **
in thread 7ff52edaa700 thread_name:tp_osd_tp
ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus (stable)
1: (()+0xf5e0) [0x7ff5669c25e0]
2: (gsignal()+0x37) [0x7ff5659d71f7]
3: (abort()+0x148) [0x7ff5659d88e8]
4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199)
[0x55955fdafc5e]
5: (()+0x4dddd7) [0x55955fdafdd7]
6: (PG::MissingLoc::add_active_missing(pg_missing_set<false> const&)+0x1e0)
[0x55955ffa0cb0]
7: (PG::activate(ObjectStore::Transaction&, unsigned int, std::map<int,
std::map<spg_t, pg_query_t, std::less<spg_t>,
std::allocator<std::pair<spg_t const, pg_query_t> > >,
std::less<int>, std::allocator<std::pair<int const, std::map<spg_t,
pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t const,
pg_query_t> > > > > >&, std::map<int,
std::vector<std::pair<pg_notify_t, PastIntervals>,
std::allocator<std::pair<pg_notify_t, PastIntervals> > >,
std::less<int>, std::allocator<std::pair<int const,
std::vector<std::pair<pg_notify_t, PastIntervals>,
std::allocator<std::pair<pg_notify_t, PastIntervals> > > > > >*,
PG::RecoveryCtx*)+0x1916) [0x55955ff3f1e6]
8:
(PG::RecoveryState::Active::Active(boost::statechart::state<PG::RecoveryState::Active,
PG::RecoveryState::Primary, PG::RecoveryState::Activating,
(boost::statechart::history_mode)0>::my_context)+0x370) [0x55955ff62d20]
9: (boost::statechart::simple_state<PG::RecoveryState::Peering,
PG::RecoveryState::Primary, PG::RecoveryState::GetInfo,
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
const&, void const*)+0xfb) [0x55955ffa8d5b]
10: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine,
PG::RecoveryState::Initial, std::allocator<void>,
boost::statechart::null_exception_translator>::process_queued_events()+0x97)
[0x55955ff88507]
11: (PG::handle_activate_map(PG::RecoveryCtx*)+0x1a8) [0x55955ff75848]
12: (OSD::advance_pg(unsigned int, PG*, ThreadPool::TPHandle&,
PG::RecoveryCtx*)+0x61d) [0x55955feb161d]
13: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>,
ThreadPool::TPHandle&)+0xa6) [0x55955feb2d16]
14: (PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&,
ThreadPool::TPHandle&)+0x51) [0x55956011a481]
15: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90f)
[0x55955fea7bbf]
16: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) [0x559560448976]
17: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55956044b490]
18: (()+0x7e25) [0x7ff5669bae25]
19: (clone()+0x6d) [0x7ff565a9a34d]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to
interpret this.
--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
1/ 5 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 journaler
0/ 5 objectcacher
0/ 5 client
1/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 0 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 1 reserver
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/ 5 rgw_sync
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 xio
1/ 5 compressor
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
4/ 5 memdb
1/ 5 kinetic
1/ 5 fuse
1/ 5 mgr
1/ 5 mgrc
1/ 5 dpdk
1/ 5 eventtrace
1/ 5 prioritycache
-2/-2 (syslog threshold)
-1/-1 (stderr threshold)
max_recent 10000
max_new 1000
log_file /var/log/ceph/ceph-osd.299.log
--- end dump of recent events ---
[2]
[root@ceph-mon01 ~]# ceph pg 36.321b list_unfound
{
"num_missing": 1,
"num_unfound": 1,
"objects": [
{
"oid": {
"oid": "hit_set_36.321b_archive_2020-02-24
21:15:16.792846_2020-02-24 21:15:32.457855",
"key": "",
"snapid": -2,
"hash": 12827,
"max": 0,
"pool": 36,
"namespace": ".ceph-internal"
},
"need": "1352209'2834660",
"have": "0'0",
"flags": "none",
"locations": []
}
],
"more": false
}
[root@ceph-mon01 ~]# ceph pg 36.324a list_unfound
{
"num_missing": 1,
"num_unfound": 1,
"objects": [
{
"oid": {
"oid": "hit_set_36.324a_archive_2020-02-25
12:40:58.130723_2020-02-25 12:46:25.260587",
"key": "",
"snapid": -2,
"hash": 12874,
"max": 0,
"pool": 36,
"namespace": ".ceph-internal"
},
"need": "1361100'2822063",
"have": "0'0",
"flags": "none",
"locations": []
}
],
"more": false
}
[root@ceph-mon01 ~]# ceph pg 36.10dc list_unfound
{
"num_missing": 1,
"num_unfound": 1,
"objects": [
{
"oid": {
"oid": "hit_set_36.10dc_archive_2020-02-25
12:40:58.129048_2020-02-25 12:45:02.202268",
"key": "",
"snapid": -2,
"hash": 4316,
"max": 0,
"pool": 36,
"namespace": ".ceph-internal"
},
"need": "1361089'2838543",
"have": "0'0",
"flags": "none",
"locations": []
}
],
"more": false
}