My bad, it does connect and works OK after all. But it took 5 hours to do
so, from 18:04 until 23:00.
Kind regards.
On Fri, Mar 31, 2023 at 9:47 PM <pmestre(a)gmail.com> wrote:
Hello, I've been running a 3-node Proxmox cluster with 4 Ceph OSDs as a
production cluster for 3 years.
As a test before moving the Ceph cluster network, I destroyed one of the
3 working monitors and tried to recreate it. Since then, the new monitor
refuses to join the cluster, even on the old network. I've tried all the
steps in the "Troubleshooting monitors" section of the documentation; the
destroy/recreate procedure is sketched below.
The new monitor reports this state, extracted from its admin socket (ceph --admin-daemon file.asok):
{
    "name": "n3ceph",
    "rank": -1,
    "state": "probing",
    "election_epoch": 0,
    "quorum": [],
    "features": {
        "required_con": "2449958197560098820",
        "required_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ],
        "quorum_con": "0",
        "quorum_mon": []
    },
    "outside_quorum": [],
    "extra_probe_peers": [],
    "sync_provider": [],
    "monmap": {
        "epoch": 6,
        "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605",
        "modified": "2023-03-31T11:54:44.616569Z",
        "created": "2019-12-02T13:50:38.097448Z",
        "min_mon_release": 16,
        "min_mon_release_name": "pacific",
        "election_strategy": 1,
        "disallowed_leaders: ": "",
        "stretch_mode": false,
        "tiebreaker_mon": "",
        "removed_ranks: ": "1",
        "features": {
            "persistent": [
                "kraken",
                "luminous",
                "mimic",
                "osdmap-prune",
                "nautilus",
                "octopus",
                "pacific",
                "elector-pinging"
            ],
            "optional": []
        },
        "mons": [
            {
                "rank": 0,
                "name": "node1",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.1:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.1:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.1:6789/0",
                "public_addr": "10.100.100.1:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            },
            {
                "rank": 1,
                "name": "node2",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.2:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.2:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.2:6789/0",
                "public_addr": "10.100.100.2:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            }
        ]
    },
    "feature_map": {
        "mon": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ]
    },
    "stretch_mode": false
}
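(For reference, the state above was pulled through the monitor's admin
socket; the typical invocation, assuming the default socket path, is
something like:

  ceph --admin-daemon /var/run/ceph/ceph-mon.n3ceph.asok mon_status
)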
The mon_status of the quorum leader is as follows:
{
    "name": "node1",
    "rank": 0,
    "state": "leader",
    "election_epoch": 340,
    "quorum": [
        0,
        1
    ],
    "quorum_age": 13090,
    "features": {
        "required_con": "2449958747317026820",
        "required_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ],
        "quorum_con": "4540138314316775423",
        "quorum_mon": [
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune",
            "nautilus",
            "octopus",
            "pacific",
            "elector-pinging"
        ]
    },
    "outside_quorum": [],
    "extra_probe_peers": [],
    "sync_provider": [],
    "monmap": {
        "epoch": 6,
        "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605",
        "modified": "2023-03-31T11:54:44.616569Z",
        "created": "2019-12-02T13:50:38.097448Z",
        "min_mon_release": 16,
        "min_mon_release_name": "pacific",
        "election_strategy": 1,
        "disallowed_leaders: ": "",
        "stretch_mode": false,
        "tiebreaker_mon": "",
        "removed_ranks: ": "1",
        "features": {
            "persistent": [
                "kraken",
                "luminous",
                "mimic",
                "osdmap-prune",
                "nautilus",
                "octopus",
                "pacific",
                "elector-pinging"
            ],
            "optional": []
        },
        "mons": [
            {
                "rank": 0,
                "name": "node1",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.1:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.1:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.1:6789/0",
                "public_addr": "10.100.100.1:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            },
            {
                "rank": 1,
                "name": "node2",
                "public_addrs": {
                    "addrvec": [
                        {
                            "type": "v2",
                            "addr": "10.100.100.2:3300",
                            "nonce": 0
                        },
                        {
                            "type": "v1",
                            "addr": "10.100.100.2:6789",
                            "nonce": 0
                        }
                    ]
                },
                "addr": "10.100.100.2:6789/0",
                "public_addr": "10.100.100.2:6789/0",
                "priority": 0,
                "weight": 0,
                "crush_location": "{}"
            }
        ]
    },
    "feature_map": {
        "mon": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ],
        "osd": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 5
            }
        ],
        "client": [
            {
                "features": "0x2f018fb87aa4aafe",
                "release": "luminous",
                "num": 1
            },
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 12
            }
        ],
        "mgr": [
            {
                "features": "0x3f01cfbdfffdffff",
                "release": "luminous",
                "num": 1
            }
        ]
    },
    "stretch_mode": false
}
I tried to get a debug log with
  ceph daemon mon.n3ceph config set debug_mon 10/10
and by restarting the service, but the Ceph log file stopped being written
after I tried that setting; an alternative is sketched below.
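Something I might try instead (untested here) is raising the debug level
persistently in the config file rather than through the admin socket,
roughly:

  # in /etc/ceph/ceph.conf (on Proxmox a symlink to /etc/pve/ceph.conf),
  # under the [mon] section:
  debug_mon = 10/10

  # then restart the monitor:
  systemctl restart ceph-mon@n3ceph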
journalctl -u ceph-mon@n3ceph tells me:
Mar 31 17:35:22 node3 ceph-mon[240916]: 2023-03-31T17:35:22.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
Mar 31 17:35:27 node3 ceph-mon[240916]: 2023-03-31T17:35:27.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
Mar 31 17:35:32 node3 ceph-mon[240916]: 2023-03-31T17:35:32.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
Any ideas? The cluster is running fine with two monitors, but since quorum
with two monitors requires both of them, a reboot of either remaining node
would take the cluster down, which might be a big problem; the current
quorum membership can be checked as shown below.
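For example, something like:

  ceph quorum_status --format json-pretty

should list only node1 and node2 in the quorum.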
Kind regards and many thanks.