We have been evaluating other cluster storage solutions, and one of them is just about as fast as Ceph but only uses FUSE. They mentioned that recent improvements in the FUSE code allow for performance similar to kernel code. So I'm running some tests comparing the CephFS kernel and FUSE clients, and that does not hold true in the Ceph case.

It seems that a lot of time is spent in locks and polls. I'm wondering whether this was needed in the past to work around some deficiencies in FUSE but is no longer necessary. I don't know enough about FUSE to figure it out on my own.

A highly parallel workload was running while these samples were taken.

Running `perf top`, I'm seeing:
```
 16.90%  [kernel]                  [k] do_sys_poll
 16.68%  libopen-pal.so.20.10.1    [.] 0x0000000000082091
 12.21%  [kernel]                  [k] __fget
  8.36%  [kernel]                  [k] fput
  7.01%  [kernel]                  [k] tcp_poll
  2.94%  [kernel]                  [k] sock_poll
  1.96%  [vdso]                    [.] 0x0000000000000977
  1.92%  [kernel]                  [k] syscall_return_via_sysret
  1.58%  [kernel]                  [k] tcp_stream_memory_free
```
Annotating do_sys_poll, I get:
```
  0.09 │     → callq  poll_freewait
 0.09 │       mov    -0x3d8(%rbp),%rcx
      │       lea    -0x3b0(%rbp),%rsi
      │       xor    %r8d,%r8d
 0.00 │3f3:   mov    0x8(%rsi),%eax
 0.09 │       lea    0xc(%rsi),%r9
 0.00 │       test   %eax,%eax
      │     ↓ jle    4ce
      │       xor    %edx,%edx
      │     ↓ jmp    416
 2.03 406:   add    $0x1,%edx
 2.02       add    $0x8,%rcx
 6.33       cmp    %edx,0x8(%rsi)
 0.19 │     ↓ jle    4ce
 0.09 │416:   movslq %edx,%rax
 1.99       movzwl 0x6(%r9,%rax,8),%edi
22.59       stac
 2.01       mov    %r8d,%eax
 8.88       mov    %di,0x6(%rcx)
26.62       clac
 0.00 │       test   %eax,%eax
 2.12     ↑ je     406
      │430:   mov    $0xfffffff2,%r13d
 0.00 │436:   mov    -0x3b0(%rbp),%rdi
      │       test   %rdi,%rdi
 0.09 │     ↓ je     452
      │442:   mov    (%rdi),%rbx
      │     → callq  kfree
      │       test   %rbx,%rbx
      │       mov    %rbx,%rdi
      │     ↑ jne    442
```
The annotation for libopen-pal.so.20.10.1 doesn't provide much info (because I'm not sure how to load the symbols):
```
  15.03%  [.] 0x0000000000082091
   0.62%  [.] 0x0000000000082093
   0.59%  [.] opal_libevent2022_event_base_loop
   0.50%  [.] 0x00000000000820a0
   0.47%  [.] opal_progress
   0.07%  [.] 0x000000000006e41b
   0.07%  [.] opal_libevent2022_evutil_tv_to_msec
```
And in __fget, I see:
```
  2.52      sbb    %rax,%rax
 0.12 │      mov    0x8(%rdx),%rdx
 0.14 │      and    %edi,%eax
 0.21 │      lea    (%rdx,%rax,8),%rax
 5.45      mov    (%rax),%rdx
 0.45 │      test   %rdx,%rdx
      │    ↓ je     5c
19.25      test   %esi,0x44(%rdx)
      │    ↓ jne    76
 3.15      mov    0x38(%rdx),%rax
 2.33      test   %rax,%rax
      │    ↑ je     1c
 0.00 │      lea    0x1(%rax),%rcx
 0.19 │      lea    0x38(%rdx),%r10
58.09      lock   cmpxchg %rcx,0x38(%rdx)
 0.02 │    ↓ jne    61
 2.31 5c:   mov    %rdx,%rax
 0.00 │      pop    %rbp
 0.00 │    ← retq
```

Thank you,
----------------
Robert LeBlanc
PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1