Hi Wyll,


There should be an error description in the OSD log prior to the assertion, containing the substring "bdev-read failed". What reason does it give?


I also suggest checking for H/W errors with dmesg and/or smartctl, since read errors tend to be H/W issues.


Thanks,

Igor


On 6/3/2020 10:42 PM, Wyll Ingersoll wrote:
One of our bluestore OSD daemons started crashing pretty regularly during scrub recently. Here is the crash info from one of the dumps. Not sure what to do (if anything) about it.

thanks,
   Wyllys Ingersoll


{

    "os_version_id": "16.04", 

    "assert_condition": "r == 0", 

    "utsname_release": "4.19.34-041934-generic", 

    "os_name": "Ubuntu", 

    "entity_name": "osd.59", 

    "assert_file": "/build/ceph-14.2.9/src/os/bluestore/BlueStore.cc", 

    "timestamp": "2020-05-31 12:01:11.624651Z", 

    "process_name": "ceph-osd", 

    "utsname_machine": "x86_64", 

    "assert_line": 9214, 

    "utsname_sysname": "Linux", 

    "os_version": "16.04.4 LTS (Xenial Xerus)", 

    "os_id": "ubuntu", 

    "assert_thread_name": "tp_osd_tp", 

    "utsname_version": "#201904051741 SMP Fri Apr 5 21:43:27 UTC 2019", 

    "backtrace": [

        "(()+0x11390) [0x7f3699516390]", 

        "(gsignal()+0x38) [0x7f3698a41428]", 

        "(abort()+0x16a) [0x7f3698a4302a]", 

        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x83c977]", 

        "(ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x83cb01]", 

        "(BlueStore::_do_read(BlueStore::Collection*, boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long, ceph::buffer::v14_2_0::list&, unsigned int, unsigned long)+0x2cfc) [0xd7a3fc]", 

        "(BlueStore::read(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ghobject_t const&, unsigned long, unsigned long, ceph::buffer::v14_2_0::list&, unsigned int)+0x1bb) [0xd7fc9b]", 

        "(ReplicatedBackend::be_deep_scrub(hobject_t const&, ScrubMap&, ScrubMapBuilder&, ScrubMap::object&)+0x2d2) [0xbdcbb2]", 

        "(PGBackend::be_scan_list(ScrubMap&, ScrubMapBuilder&)+0x393) [0xaf2a73]", 

        "(PG::build_scrub_map_chunk(ScrubMap&, ScrubMapBuilder&, hobject_t, hobject_t, bool, ThreadPool::TPHandle&)+0x7b) [0x9824db]", 

        "(PG::chunky_scrub(ThreadPool::TPHandle&)+0x1733) [0x9b24d3]", 

        "(PG::scrub(unsigned int, ThreadPool::TPHandle&)+0xaf) [0x9b35bf]", 

        "(PGScrub::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x1a) [0xb781da]", 

        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xbed) [0x8dd11d]", 

        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x4ac) [0xee22ac]", 

        "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xee5470]", 

        "(()+0x76ba) [0x7f369950c6ba]", 

        "(clone()+0x6d) [0x7f3698b1341d]"

    ], 

    "utsname_hostname": "ss005", 

    "assert_msg": "/build/ceph-14.2.9/src/os/bluestore/BlueStore.cc: In function 'int BlueStore::_do_read(BlueStore::Collection*, BlueStore::OnodeRef, uint64_t, size_t, ceph::bufferlist&, uint32_t, uint64_t)' thread 7f3675040700 time 2020-05-31 08:01:11.613149\n/build/ceph-14.2.9/src/os/bluestore/BlueStore.cc: 9214: FAILED ceph_assert(r == 0)\n", 

    "crash_id": "2020-05-31_12:01:11.624651Z_ed9870b4-bb20-4c95-af6a-19a16cde5b18", 

    "assert_func": "int BlueStore::_do_read(BlueStore::Collection*, BlueStore::OnodeRef, uint64_t, size_t, ceph::bufferlist&, uint32_t, uint64_t)", 

    "ceph_version": "14.2.9"

}




_______________________________________________
Dev mailing list -- dev@ceph.io
To unsubscribe send an email to dev-leave@ceph.io