Hi
Thanks for the reply. Yeah, I restarted all of the mon servers, in
sequence, and yesterday just the leader alone, without any success.
Reports:
root@monb01:~# ceph report | grep committed
report 4002437698
"monmap_first_committed": 1,
"monmap_last_committed": 6,
"osdmap_first_committed": 67114,
"osdmap_last_committed": 72592,
"mdsmap_first_committed": 1,
"mdsmap_last_committed": 1,
"first_committed": 609225,
"last_committed": 609251,
"first_committed": 180754137,
"last_committed": 180754777,
root@monb01:~#
root@monb01:~# ceph report | jq .osdmap_clean_epochs
report 395175214
{
"min_last_epoch_clean": 72592,
"last_epoch_clean": {
"per_pool": [
{
"poolid": 0,
"floor": 72592
},
{
"poolid": 1,
"floor": 72592
},
{
"poolid": 2,
"floor": 72592
},
{
"poolid": 3,
"floor": 72592
},
{
"poolid": 4,
"floor": 72592
},
{
"poolid": 5,
"floor": 72592
},
{
"poolid": 26,
"floor": 72592
},
{
"poolid": 27,
"floor": 72592
},
{
"poolid": 28,
"floor": 72592
}
]
},
"osd_epochs": [
{
"id": 0,
"epoch": 72592
},
{
"id": 1,
"epoch": 72592
},
{
"id": 2,
"epoch": 72592
},
{
"id": 3,
"epoch": 72592
},
{
"id": 4,
"epoch": 72592
},
{
"id": 5,
"epoch": 72592
},
{
"id": 6,
"epoch": 72592
},
{
"id": 7,
"epoch": 72592
},
{
"id": 8,
"epoch": 72592
},
{
"id": 9,
"epoch": 72592
},
{
"id": 10,
"epoch": 72592
},
{
"id": 11,
"epoch": 72592
},
{
"id": 12,
"epoch": 72592
},
{
"id": 13,
"epoch": 72592
},
{
"id": 14,
"epoch": 72592
},
{
"id": 15,
"epoch": 72592
},
{
"id": 16,
"epoch": 72592
},
{
"id": 17,
"epoch": 72592
},
{
"id": 18,
"epoch": 72592
},
{
"id": 19,
"epoch": 72592
},
{
"id": 20,
"epoch": 72592
},
{
"id": 21,
"epoch": 72592
},
{
"id": 22,
"epoch": 72592
},
{
"id": 23,
"epoch": 72592
},
{
"id": 24,
"epoch": 72592
},
{
"id": 25,
"epoch": 72592
},
{
"id": 26,
"epoch": 72592
},
{
"id": 27,
"epoch": 72592
},
{
"id": 28,
"epoch": 72592
},
{
"id": 29,
"epoch": 72592
},
{
"id": 30,
"epoch": 72592
},
{
"id": 31,
"epoch": 72592
},
{
"id": 32,
"epoch": 72592
},
{
"id": 33,
"epoch": 72592
},
{
"id": 34,
"epoch": 72592
},
{
"id": 35,
"epoch": 72592
},
{
"id": 36,
"epoch": 72592
},
{
"id": 37,
"epoch": 72592
},
{
"id": 38,
"epoch": 72592
},
{
"id": 39,
"epoch": 72592
},
{
"id": 40,
"epoch": 72592
},
{
"id": 41,
"epoch": 72592
},
{
"id": 42,
"epoch": 72592
},
{
"id": 43,
"epoch": 72592
},
{
"id": 44,
"epoch": 72592
},
{
"id": 45,
"epoch": 72592
},
{
"id": 46,
"epoch": 72592
},
{
"id": 47,
"epoch": 72592
},
{
"id": 48,
"epoch": 72592
},
{
"id": 49,
"epoch": 72592
},
{
"id": 50,
"epoch": 72592
},
{
"id": 51,
"epoch": 72592
},
{
"id": 52,
"epoch": 72592
},
{
"id": 53,
"epoch": 72592
},
{
"id": 54,
"epoch": 72592
},
{
"id": 55,
"epoch": 72592
},
{
"id": 56,
"epoch": 72592
},
{
"id": 57,
"epoch": 72592
},
{
"id": 58,
"epoch": 72592
},
{
"id": 59,
"epoch": 72592
},
{
"id": 60,
"epoch": 72592
},
{
"id": 61,
"epoch": 72592
},
{
"id": 62,
"epoch": 72592
},
{
"id": 63,
"epoch": 72592
},
{
"id": 64,
"epoch": 72592
},
{
"id": 65,
"epoch": 72592
},
{
"id": 66,
"epoch": 72592
},
{
"id": 67,
"epoch": 72592
},
{
"id": 68,
"epoch": 72592
},
{
"id": 69,
"epoch": 72592
},
{
"id": 70,
"epoch": 72592
},
{
"id": 71,
"epoch": 72592
},
{
"id": 72,
"epoch": 72592
},
{
"id": 73,
"epoch": 72592
},
{
"id": 74,
"epoch": 72592
},
{
"id": 75,
"epoch": 72592
},
{
"id": 76,
"epoch": 72592
},
{
"id": 77,
"epoch": 72592
},
{
"id": 78,
"epoch": 72592
},
{
"id": 79,
"epoch": 72592
},
{
"id": 80,
"epoch": 72592
},
{
"id": 81,
"epoch": 72592
},
{
"id": 83,
"epoch": 72592
},
{
"id": 84,
"epoch": 72592
},
{
"id": 85,
"epoch": 72592
},
{
"id": 86,
"epoch": 72592
},
{
"id": 87,
"epoch": 72592
},
{
"id": 88,
"epoch": 72592
},
{
"id": 89,
"epoch": 72592
},
{
"id": 90,
"epoch": 72592
},
{
"id": 91,
"epoch": 72592
},
{
"id": 92,
"epoch": 72592
},
{
"id": 93,
"epoch": 72592
},
{
"id": 94,
"epoch": 72592
},
{
"id": 95,
"epoch": 72592
},
{
"id": 96,
"epoch": 72592
},
{
"id": 97,
"epoch": 72592
},
{
"id": 98,
"epoch": 72592
},
{
"id": 99,
"epoch": 72592
},
{
"id": 100,
"epoch": 72592
},
{
"id": 101,
"epoch": 72592
},
{
"id": 102,
"epoch": 72592
},
{
"id": 103,
"epoch": 72592
},
{
"id": 104,
"epoch": 72592
},
{
"id": 105,
"epoch": 72592
},
{
"id": 106,
"epoch": 72592
},
{
"id": 107,
"epoch": 72592
},
{
"id": 108,
"epoch": 72592
},
{
"id": 109,
"epoch": 72592
},
{
"id": 110,
"epoch": 72592
},
{
"id": 111,
"epoch": 72592
},
{
"id": 112,
"epoch": 72592
},
{
"id": 113,
"epoch": 72592
},
{
"id": 114,
"epoch": 72592
},
{
"id": 115,
"epoch": 72592
},
{
"id": 116,
"epoch": 72592
},
{
"id": 117,
"epoch": 72592
},
{
"id": 118,
"epoch": 72592
},
{
"id": 119,
"epoch": 72592
},
{
"id": 120,
"epoch": 72592
},
{
"id": 121,
"epoch": 72592
},
{
"id": 122,
"epoch": 72592
},
{
"id": 123,
"epoch": 72592
},
{
"id": 124,
"epoch": 72592
},
{
"id": 125,
"epoch": 72592
},
{
"id": 126,
"epoch": 72592
},
{
"id": 127,
"epoch": 72592
},
{
"id": 128,
"epoch": 72592
},
{
"id": 129,
"epoch": 72592
},
{
"id": 130,
"epoch": 72592
},
{
"id": 131,
"epoch": 72592
},
{
"id": 132,
"epoch": 72592
},
{
"id": 133,
"epoch": 72592
},
{
"id": 134,
"epoch": 72592
},
{
"id": 135,
"epoch": 72592
},
{
"id": 136,
"epoch": 72592
},
{
"id": 137,
"epoch": 72592
},
{
"id": 138,
"epoch": 72592
},
{
"id": 139,
"epoch": 72592
},
{
"id": 140,
"epoch": 72592
},
{
"id": 141,
"epoch": 72592
},
{
"id": 142,
"epoch": 72592
},
{
"id": 143,
"epoch": 72592
},
{
"id": 144,
"epoch": 72592
},
{
"id": 145,
"epoch": 72592
},
{
"id": 146,
"epoch": 72592
},
{
"id": 147,
"epoch": 72592
},
{
"id": 148,
"epoch": 72592
},
{
"id": 149,
"epoch": 72592
},
{
"id": 150,
"epoch": 72592
},
{
"id": 151,
"epoch": 72592
},
{
"id": 152,
"epoch": 72592
},
{
"id": 153,
"epoch": 72592
},
{
"id": 154,
"epoch": 72592
},
{
"id": 155,
"epoch": 72592
},
{
"id": 156,
"epoch": 72592
},
{
"id": 157,
"epoch": 72592
},
{
"id": 158,
"epoch": 72592
},
{
"id": 159,
"epoch": 72592
},
{
"id": 160,
"epoch": 72592
},
{
"id": 161,
"epoch": 72592
},
{
"id": 162,
"epoch": 72592
},
{
"id": 163,
"epoch": 72592
},
{
"id": 164,
"epoch": 72592
},
{
"id": 165,
"epoch": 72592
},
{
"id": 166,
"epoch": 72592
},
{
"id": 167,
"epoch": 72592
},
{
"id": 168,
"epoch": 72592
},
{
"id": 169,
"epoch": 72592
},
{
"id": 170,
"epoch": 72592
},
{
"id": 171,
"epoch": 72592
},
{
"id": 172,
"epoch": 72592
},
{
"id": 173,
"epoch": 72592
},
{
"id": 174,
"epoch": 72592
},
{
"id": 175,
"epoch": 72592
},
{
"id": 176,
"epoch": 72592
},
{
"id": 177,
"epoch": 72592
},
{
"id": 178,
"epoch": 72592
},
{
"id": 179,
"epoch": 72592
},
{
"id": 180,
"epoch": 72592
},
{
"id": 181,
"epoch": 72592
},
{
"id": 182,
"epoch": 72592
},
{
"id": 183,
"epoch": 72592
},
{
"id": 184,
"epoch": 72592
},
{
"id": 185,
"epoch": 72592
},
{
"id": 186,
"epoch": 72592
},
{
"id": 187,
"epoch": 72592
},
{
"id": 188,
"epoch": 72592
},
{
"id": 189,
"epoch": 72592
},
{
"id": 190,
"epoch": 72592
},
{
"id": 191,
"epoch": 72592
},
{
"id": 192,
"epoch": 72592
},
{
"id": 193,
"epoch": 72592
},
{
"id": 194,
"epoch": 72592
},
{
"id": 195,
"epoch": 72592
},
{
"id": 196,
"epoch": 72592
},
{
"id": 197,
"epoch": 72592
},
{
"id": 198,
"epoch": 72592
},
{
"id": 199,
"epoch": 72592
},
{
"id": 200,
"epoch": 72592
},
{
"id": 201,
"epoch": 72592
},
{
"id": 202,
"epoch": 72592
},
{
"id": 203,
"epoch": 72592
},
{
"id": 204,
"epoch": 72592
},
{
"id": 205,
"epoch": 72592
},
{
"id": 206,
"epoch": 72592
},
{
"id": 207,
"epoch": 72592
},
{
"id": 208,
"epoch": 72592
},
{
"id": 209,
"epoch": 72592
},
{
"id": 210,
"epoch": 72592
},
{
"id": 211,
"epoch": 72592
},
{
"id": 212,
"epoch": 72592
},
{
"id": 213,
"epoch": 72592
},
{
"id": 214,
"epoch": 72592
},
{
"id": 215,
"epoch": 72592
},
{
"id": 216,
"epoch": 72592
},
{
"id": 217,
"epoch": 72592
},
{
"id": 218,
"epoch": 72592
},
{
"id": 219,
"epoch": 72592
},
{
"id": 220,
"epoch": 72592
},
{
"id": 221,
"epoch": 72592
},
{
"id": 222,
"epoch": 72592
},
{
"id": 223,
"epoch": 72592
},
{
"id": 224,
"epoch": 72592
},
{
"id": 225,
"epoch": 72592
},
{
"id": 226,
"epoch": 72592
},
{
"id": 227,
"epoch": 72592
},
{
"id": 228,
"epoch": 72592
},
{
"id": 229,
"epoch": 72592
},
{
"id": 230,
"epoch": 72592
},
{
"id": 231,
"epoch": 72592
},
{
"id": 232,
"epoch": 72592
},
{
"id": 233,
"epoch": 72592
},
{
"id": 234,
"epoch": 72592
},
{
"id": 235,
"epoch": 72592
},
{
"id": 236,
"epoch": 72592
},
{
"id": 237,
"epoch": 72592
},
{
"id": 238,
"epoch": 72592
},
{
"id": 239,
"epoch": 72592
},
{
"id": 240,
"epoch": 72592
},
{
"id": 241,
"epoch": 72592
},
{
"id": 242,
"epoch": 72592
},
{
"id": 243,
"epoch": 72592
},
{
"id": 244,
"epoch": 72592
},
{
"id": 245,
"epoch": 72592
},
{
"id": 246,
"epoch": 72592
},
{
"id": 247,
"epoch": 72592
},
{
"id": 248,
"epoch": 72592
},
{
"id": 249,
"epoch": 72592
},
{
"id": 250,
"epoch": 72592
},
{
"id": 251,
"epoch": 72592
},
{
"id": 252,
"epoch": 72592
},
{
"id": 253,
"epoch": 72592
},
{
"id": 254,
"epoch": 72592
},
{
"id": 255,
"epoch": 72592
},
{
"id": 256,
"epoch": 72592
},
{
"id": 257,
"epoch": 72592
},
{
"id": 258,
"epoch": 72592
},
{
"id": 259,
"epoch": 72592
},
{
"id": 260,
"epoch": 72592
},
{
"id": 261,
"epoch": 72592
},
{
"id": 262,
"epoch": 72592
},
{
"id": 263,
"epoch": 72592
},
{
"id": 264,
"epoch": 72592
},
{
"id": 265,
"epoch": 72592
},
{
"id": 266,
"epoch": 72592
},
{
"id": 267,
"epoch": 72592
},
{
"id": 268,
"epoch": 72592
},
{
"id": 269,
"epoch": 72592
},
{
"id": 270,
"epoch": 72592
},
{
"id": 271,
"epoch": 72592
},
{
"id": 272,
"epoch": 72592
},
{
"id": 273,
"epoch": 72592
},
{
"id": 274,
"epoch": 72592
},
{
"id": 275,
"epoch": 72592
},
{
"id": 276,
"epoch": 72592
},
{
"id": 277,
"epoch": 72592
},
{
"id": 278,
"epoch": 72592
},
{
"id": 279,
"epoch": 72592
},
{
"id": 280,
"epoch": 72592
},
{
"id": 281,
"epoch": 72592
},
{
"id": 282,
"epoch": 72592
},
{
"id": 283,
"epoch": 72592
},
{
"id": 284,
"epoch": 72592
}
]
}
root@monb01:~#
W dniu 2020-11-12 11:58, Dan van der Ster napisał(a):
> I found another possible trimming bug this morning, but I don't expect
> it applies to you because you said you restarted the mon leader:
>
https://tracker.ceph.com/issues/48212
>
> Otherwise, could you please share the output of
>
> ceph report | grep committed
> ceph report | jq .osdmap_clean_epochs
>
> Thanks,
>
> Dan
>
> On Thu, Nov 12, 2020 at 10:56 AM <m.sliwinski(a)lh.pl> wrote:
>>
>> Hi
>>
>> Thanks for the response. Our cluster is currently mostly on 14.2.13,
>> especially all MONs and MGRs are.
>> Some OSDs are still on 14.2.9, but it shouldn't block osdmap trimming
>> i
>> think, because atm we don't have any down OSDs, i checked for that
>> when
>> we first noticed the issue.
>> I'm working of course on bringing all OSDs to 14.2.13, but it will
>> take
>> some time as i have to create and test Debian packages for that.
>> Could there be any other reason? I found posts about PGs for a new pool
>> not being marked as created in the MON db while the cluster still reports
>> everything as active+clean, but I don't know how to debug that.
>>
>> --
>> Best regards
>> Marcin
>>
>>
>> W dniu 2020-11-11 16:50, Dan van der Ster napisał(a):
>> > Hi,
>> >
>> > v14.2.13 has an important fix in this area:
>> >
https://tracker.ceph.com/issues/47290
>> > Without this fix, your cluster will not trim if there are any *down*
>> > osds in the cluster.
>> >
>> > On our clusters we are running v14.2.11 patched with commit
>> > "mon/OSDMonitor: only take in osd into consideration when trimming
>> > osdmaps" -- this trims maps perfectly afaict.
>> >
>> > I can't vouch for the rest of 14.2.13, so better test that adequately
>> > before upgrading.
>> >
>> > Cheers, Dan
>> >
>> >
>> > On Tue, Nov 10, 2020 at 6:57 PM <m.sliwinski(a)lh.pl> wrote:
>> >>
>> >> Hi
>> >>
>> >> We have ceph cluster running on Nautilus, recently upgraded from
>> >> Mimic.
>> >> When in Mimic we noticed issue with osdmap not trimming, which caused
>> >> part of our cluster to crash due to osdmap cache misses. We solved it
>> >> by
>> >> adding "osd_map_cache_size = 5000" to our ceph.conf
>> >> Because we had at that time mixed OSD versions from both Mimic and
>> >> Nautilus we decided to finish upgrade, but it didn't solve our
>> >> problem.
>> >> We have at the moment: "oldest_map": 67114,
"newest_map": 72588,and
>> >> the
>> >> difference is not shrinking even thought cluster is in active+clean
>> >> state. Restarting all mon's didn't help. It seems bug is similar
to
>> >>
https://tracker.ceph.com/issues/44184 but there's no solution
there.
>> >> What else can i check or do?
>> >> I don't want to do dangerous things like mon_osd_force_trim_to or
>> >> something similar without finding the cause.
>> >>
>> >> I noticed in MON debug log:
>> >>
>> >> 2020-11-10 17:11:14.612 7f9592d5b700 10 mon.monb01(a)0(leader).osd
>> >> e72571
>> >> should_prune could only prune 4957 epochs (67114..72071), which is
>> >> less
>> >> than the required minimum (10000)
>> >> 2020-11-10 17:11:19.612 7f9592d5b700 10 mon.monb01(a)0(leader).osd
>> >> e72571
>> >> should_prune could only prune 4957 epochs (67114..72071), which is
>> >> less
>> >> than the required minimum (10000)
>> >>
>> >> So i added config options to reduce those values:
>> >>
>> >> mon dev mon_debug_block_osdmap_trim false
>> >> mon advanced mon_min_osdmap_epochs 100
>> >> mon advanced mon_osdmap_full_prune_min 500
>> >> mon advanced paxos_service_trim_min 10
>> >>
>> >> But it didn't help:
>> >>
>> >> 2020-11-10 18:28:26.165 7f1b700ab700 20 mon.monb01(a)0(leader).osd
>> >> e72588
>> >> load_osdmap_manifest osdmap manifest detected in store; reload.
>> >> 2020-11-10 18:28:26.169 7f1b700ab700 10 mon.monb01(a)0(leader).osd
>> >> e72588
>> >> load_osdmap_manifest store osdmap manifest pinned (67114 .. 72484)
>> >> 2020-11-10 18:28:26.169 7f1b700ab700 10 mon.monb01(a)0(leader).osd
>> >> e72588
>> >> should_prune not enough epochs to form an interval (last pinned:
>> >> 72484,
>> >> last to pin: 72488, interval: 10)
>> >>
>> >> Command "ceph report | jq '.osdmap_manifest' |jq
'.pinned_maps[]'"
>> >> shows
>> >> 67114 on the top, but i'm unable to determine why.
>> >>
>> >> Same with 'ceph report | jq .osdmap_first_committed':
>> >>
>> >> root@monb01:/var/log/ceph# ceph report | jq .osdmap_first_committed
>> >> report 4073203295
>> >> 67114
>> >> root@monb01:/var/log/ceph#
>> >>
>> >> When I try to determine if a certain PG or OSD is keeping it so low I
>> >> don't get anything.
>> >>
>> >> And in MON debug log i get:
>> >>
>> >> 2020-11-10 18:42:41.767 7f1b74721700 10 mon.monb01@0(leader) e6
>> >> refresh_from_paxos
>> >> 2020-11-10 18:42:41.767 7f1b74721700 10
>> >> mon.monb01(a)0(leader).paxosservice(mdsmap 1..1) refresh
>> >> 2020-11-10 18:42:41.767 7f1b74721700 10
>> >> mon.monb01(a)0(leader).paxosservice(osdmap 67114..72588) refresh
>> >> 2020-11-10 18:42:41.767 7f1b74721700 20 mon.monb01(a)0(leader).osd
>> >> e72588
>> >> load_osdmap_manifest osdmap manifest detected in store; reload.
>> >> 2020-11-10 18:42:41.767 7f1b74721700 10 mon.monb01(a)0(leader).osd
>> >> e72588
>> >> load_osdmap_manifest store osdmap manifest pinned (67114 .. 72484)
>> >>
>> >> I also get:
>> >>
>> >> root@monb01:/var/log/ceph# ceph report |grep
"min_last_epoch_clean"
>> >> report 2716976759
>> >> "min_last_epoch_clean": 0,
>> >> root@monb01:/var/log/ceph#
>> >>
>> >>
>> >> Additional info:
>> >> root@monb01:/var/log/ceph# ceph versions
>> >> {
>> >> "mon": {
>> >> "ceph version 14.2.13
>> >> (1778d63e55dbff6cedb071ab7d367f8f52a8699f)
>> >> nautilus (stable)": 3
>> >> },
>> >> "mgr": {
>> >> "ceph version 14.2.13
>> >> (1778d63e55dbff6cedb071ab7d367f8f52a8699f)
>> >> nautilus (stable)": 3
>> >> },
>> >> "osd": {
>> >> "ceph version 14.2.13
>> >> (1778d63e55dbff6cedb071ab7d367f8f52a8699f)
>> >> nautilus (stable)": 120,
>> >> "ceph version 14.2.9
>> >> (581f22da52345dba46ee232b73b990f06029a2a0)
>> >> nautilus (stable)": 164
>> >> },
>> >> "mds": {},
>> >> "overall": {
>> >> "ceph version 14.2.13
>> >> (1778d63e55dbff6cedb071ab7d367f8f52a8699f)
>> >> nautilus (stable)": 126,
>> >> "ceph version 14.2.9
>> >> (581f22da52345dba46ee232b73b990f06029a2a0)
>> >> nautilus (stable)": 164
>> >> }
>> >> }
>> >>
>> >>
>> >> root@monb01:/var/log/ceph# ceph mon feature ls
>> >>
>> >> all features
>> >> supported: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> >> persistent: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> >> on current monmap (epoch 6)
>> >> persistent: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> >> required: [kraken,luminous,mimic,osdmap-prune,nautilus]
>> >>
>> >>
>> >> root@monb01:/var/log/ceph# ceph osd dump | grep require
>> >> require_min_compat_client luminous
>> >> require_osd_release nautilus
>> >>
>> >>
>> >> root@monb01:/var/log/ceph# ceph report | jq
>> >> '.osdmap_manifest.pinned_maps | length'
>> >> report 1777129876
>> >> 538
>> >>
>> >> root@monb01:/var/log/ceph# ceph pg dump -f json | jq .osd_epochs
>> >> dumped all
>> >> null
>> >>
>> >> --
>> >> Best regards
>> >> Marcin
>> >> _______________________________________________
>> >> ceph-users mailing list -- ceph-users(a)ceph.io
>> >> To unsubscribe send an email to ceph-users-leave(a)ceph.io